Input Package and Library

library(corrplot)

## corrplot 0.92 loaded

library(ggplot2)
library(car)

## Loading required package: carData

library(olsrr)

## 
## Attaching package: 'olsrr'

## The following object is masked from 'package:datasets':
## 
##     rivers

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

lapply(c("tidyverse","rvest","kableExtra"),library,character.only=T)[[1]]

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ✔ readr     2.1.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## 
## Attaching package: 'rvest'
## 
## 
## The following object is masked from 'package:readr':
## 
##     guess_encoding
## 
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

##  [1] "lubridate" "forcats"   "stringr"   "purrr"     "readr"     "tidyr"    
##  [7] "tibble"    "tidyverse" "dplyr"     "olsrr"     "car"       "carData"  
## [13] "ggplot2"   "corrplot"  "stats"     "graphics"  "grDevices" "utils"    
## [19] "datasets"  "methods"   "base"

lapply(c("glmnet","lmridge"),library,character.only=T)[[1]]

## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-8
## 
## Attaching package: 'lmridge'
## 
## The following object is masked from 'package:car':
## 
##     vif

##  [1] "glmnet"     "Matrix"     "kableExtra" "rvest"      "lubridate" 
##  [6] "forcats"    "stringr"    "purrr"      "readr"      "tidyr"     
## [11] "tibble"     "tidyverse"  "dplyr"      "olsrr"      "car"       
## [16] "carData"    "ggplot2"    "corrplot"   "stats"      "graphics"  
## [21] "grDevices"  "utils"      "datasets"   "methods"    "base"

Input Data

library(readxl)
data<-read_excel("/Users/aisyahnuruzzahra/Downloads/data pokemon.xlsx")
str(data)

## tibble [75 × 4] (S3: tbl_df/tbl/data.frame)
##  $ total_strength: num [1:75] 327 429 569 363 472 612 368 473 613 285 ...
##  $ hp            : num [1:75] 45 60 80 39 58 78 44 59 79 45 ...
##  $ height        : num [1:75] 7 10 20 6 11 17 5 10 16 3 ...
##  $ weight        : num [1:75] 69 130 1000 85 190 905 90 225 855 29 ...

y<-data$`total_strength`
x1<-data$`hp`
x2<-data$`height`
x3<-data$`weight`
data1 = data.frame(y,x1,x2,x3)
x <- data.matrix(data1[,c('x1', 'x2', 'x3')])
head(data1)

##     y x1 x2   x3
## 1 327 45  7   69
## 2 429 60 10  130
## 3 569 80 20 1000
## 4 363 39  6   85
## 5 472 58 11  190
## 6 612 78 17  905

Eksplorasi Data

data1_numeric <- data1[sapply(data1, is.numeric)]
cor_matrix <- cor(data1_numeric, use = "complete.obs")
corrplot(cor_matrix, method = "number")

# Plot antara Y dengan X1

plot(x=data$hp, y=data$total_strength, main="Hubungan HP Pokemonterhadap Total Strength")

Plot antara Y dengan X2

plot(x=data$height, y=data$total_strength, main="Hubungan Height Pokemon terhadap Total Strength")

# Plot antara Y dengan X3

plot(x=data$weight, y=data$total_strength, main="Hubungan Weight Pokemon terhadap Total Strength")

## Model Regresi Klasik

modelklasik <- lm(data$total_strength~data$hp+data$height+data$weight,data=data1)
s.modelklasik <- summary(modelklasik)
s.modelklasik

## 
## Call:
## lm(formula = data$total_strength ~ data$hp + data$height + data$weight, 
##     data = data1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -188.410  -56.933   -4.719   54.854  157.892 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 291.8696    27.6919  10.540 3.61e-16 ***
## data$hp       0.8407     0.4865   1.728  0.08834 .  
## data$height   5.5378     2.2731   2.436  0.01735 *  
## data$weight   0.1304     0.0389   3.352  0.00129 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 76.07 on 71 degrees of freedom
## Multiple R-squared:  0.5025, Adjusted R-squared:  0.4815 
## F-statistic:  23.9 on 3 and 71 DF,  p-value: 8.421e-11

Model regresi klasik yang diperoleh adalah : \[total strength=291.8696+0.8407hp+ 5.5378 height+0.1304 weight\]

R Squared dan RSE Model Klasik

R squared

R2.modelklasik <- s.modelklasik$r.squared
R2.modelklasik

## [1] 0.5024762

# RSE
rse.modelklasik <- s.modelklasik$sigma
rse.modelklasik

## [1] 76.07345

Rsq Model klasik didapatkan $ R^2 =0.5025$ dan nilai RSE $RSE = 76.07345 $

Plot Pencilan dan Leverage

ols_plot_resid_lev(modelklasik)

plot(modelklasik,which=5)
ols_plot_diagnostics(modelklasik)

Uji Asumsi

1 Nilai harapan sisaan sama dengan nol

t.test(modelklasik$residuals,
       mu = 0,
       conf.level = 0.95)

## 
##  One Sample t-test
## 
## data:  modelklasik$residuals
## t = 6.6064e-16, df = 74, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -17.14445  17.14445
## sample estimates:
##    mean of x 
## 5.684342e-15

2 Sisaan saling bebas

lmtest::dwtest(modelklasik)

## 
##  Durbin-Watson test
## 
## data:  modelklasik
## DW = 1.7261, p-value = 0.1069
## alternative hypothesis: true autocorrelation is greater than 0

lmtest::bptest(modelklasik)

## 
##  studentized Breusch-Pagan test
## 
## data:  modelklasik
## BP = 5.7814, df = 3, p-value = 0.1227

Asumsi Normalitas Sisaan

shapiro.test(modelklasik$residuals)

## 
##  Shapiro-Wilk normality test
## 
## data:  modelklasik$residuals
## W = 0.98549, p-value = 0.5491

Multikolineritas

library(car)
car::vif(modelklasik)

##     data$hp data$height data$weight 
##    1.359496    1.930949    1.853587

Tidak ada multikolinieritas antarvariabel karena nilai VIF<10.

Regresi Ridge

x <- data.matrix(data[,c('hp','height' ,'weight')])
y <- data$total_strength
cv.r<-cv.glmnet(x,y,alpha=0)

best.lr<-cv.r$lambda.min
bestridge<-glmnet(x,y,alpha=0,lambda=best.lr)
coef(bestridge)

## 4 x 1 sparse Matrix of class "dgCMatrix"
##                      s0
## (Intercept) 309.0841489
## hp            0.8105141
## height        4.8718180
## weight        0.1008961

Didapatkan Model regresi Ridge ialah : \[total strength=310.69575579+0.80424856hp+4.80558228height+0.09894507weight\]

R Squared dan RSE Model Ridge

fungsi untuk cari rsquared

rsq<-function(bestmodel,bestlambda,x,y){
 #y duga
 y.duga <- predict(bestmodel, s = bestlambda, newx = x)

 #JKG dan JKT
 jkt <- sum((y - mean(y))^2)
 jkg <- sum((y.duga- y)^2)

#find R-Squared
rsq <- 1 - jkg/jkt
return(rsq) 
}

# R2 ridge
R2.modelridge <- rsq(bestridge,best.lr,x,y)
R2.modelridge

## [1] 0.4892223

#RSE ridge
train_predictionsr <- predict(bestridge,newx = x)
# Hitung residu (selisih antara prediksi dan nilai sebenarnya)
residualsr <- y - train_predictionsr
# Hitung varian residu
dfr <- length(y) - length(bestridge$beta)
residual_variancer <- sum(residualsr^2) / dfr
# Hitung RSE
rse.modelridge <- sqrt(residual_variancer)
rse.modelridge

## [1] 76.54293

Didapatkan nilai R2 pada model ridge $ R^2 = 0.4870686$ dan RSE model ridge didapatkan nilai $RSE =76.70413 $

Regresi Lasso

cv.l<-cv.glmnet(x,y,alpha=1)
best.ll<-cv.l$lambda.min
bestlasso<-glmnet(x,y,alpha=1,lambda=best.ll)
coef(bestlasso)

## 4 x 1 sparse Matrix of class "dgCMatrix"
##                      s0
## (Intercept) 302.8036074
## hp            0.7334256
## height        5.2528910
## weight        0.1246989

Didapatkan Model regresi lasso ialah : \[total strength=297.5739953+0.7846787hp+5.3893440height+ 0.1274376weight\]

R Squared dan RSE Model Lasso

# R2 lasso
R2.modellasso <- rsq(bestlasso,best.ll,x,y)
R2.modellasso

## [1] 0.5005677

#RSElasso
train_predictionsLasso <- predict(bestlasso,newx = x)
# Hitung residu (selisih antara prediksi dan nilai sebenarnya)
residualsLasso <- y - train_predictionsLasso
# Hitung varian residu
dfLasso <- length(y) - length(bestlasso$beta)
residual_varianceLasso <- sum(residualsLasso^2) / dfLasso
# Hitung RSE
rse.modelLasso <- sqrt(residual_varianceLasso)
rse.modelLasso

## [1] 75.68807

Didapatkan nilai R2 pada model lasso $ R^2 = 0.5019571$ dan RSE model ridge didapatkan nilai $RSE =75.58272 $

Perbandingan Model Regresi

perbandingan <- matrix(c(R2.modelklasik, R2.modelridge, R2.modellasso, rse.modelklasik, rse.modelridge, rse.modelLasso),ncol=2,byrow = F)
row.names(perbandingan)<- c("Model klasik","model ridge","model lasso")
colnames(perbandingan) <- c("R squared","RSE")
perbandingan

##              R squared      RSE
## Model klasik 0.5024762 76.07345
## model ridge  0.4892223 76.54293
## model lasso  0.5005677 75.68807

Perbandingan dengan melihat nilai RSE yang memiliki nilai terendah, maka model yang memiliki nilai RSE terendah ialah model lasso dengan nilai $ RSE = 75.58272$ dapat dikatakan bahwa model terbaiknya adalah model lasso

Model Terbaik (Model Lasso)

coef(bestlasso)

## 4 x 1 sparse Matrix of class "dgCMatrix"
##                      s0
## (Intercept) 302.8036074
## hp            0.7334256
## height        5.2528910
## weight        0.1246989

Didapatkan Model regresi lasso ialah : \[total strength=297.5739953+0.7846787hp+5.3893440height+0.1274376weight\] ## Interpretasi Koefesien Model total strength akan meningkat jika height, weight dan hp meningkat. akan tetapi, pengaruh peningkatan tertinggi pada total strength terdapat pada height.

Pengaruh Height, Weight, dan HP terhadap Total Strength Pokemon

Aisyah Nuruzzahra Tirtasuwanda

2023-10-06