Memanggil Package

library(rsample)
library(DataExplorer)
library(sjPlot)
## Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(openxlsx)

IMPORT DATA

Terlebih dahulu kita mengimpor data yang akan dianalisis.

library(readr)
# Parse the raw insurance CSV with readr for a first look at the data.
# NOTE(review): the path is absolute and machine-specific.
insurance_1_ <- read_csv(
  file = "C:/Users/sarah/Downloads/insurance (1).csv"
)
## Rows: 1338 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): sex, smoker, region
## dbl (4): age, bmi, children, expenses
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() opens an interactive data viewer, which is of no use (and may fail)
# when the document is rendered non-interactively, so only call it in a live
# session.
if (interactive()) {
  View(insurance_1_)
}

Kemudian kita membaca ulang data tersebut ke dalam objek data_insurance, dengan peubah bertipe karakter dikonversi menjadi faktor

# Working copy of the data: character columns are read as factors so that the
# categorical predictors are handled as such by the modelling code.
data_insurance <- read.csv(
  file = "C:/Users/sarah/Downloads/insurance (1).csv",
  stringsAsFactors = TRUE
)
# Preview the first rows.
head(data_insurance)
##   age    sex    bmi children smoker    region  expenses
## 1  19 female 27.900        0    yes southwest 16884.924
## 2  18   male 33.770        1     no southeast  1725.552
## 3  28   male 33.000        3     no southeast  4449.462
## 4  33   male 22.705        0     no northwest 21984.471
## 5  32   male 28.880        0     no northwest  3866.855
## 6  31 female 25.740        0     no southeast  3756.622

Data Exploration

Eksplorasi data adalah proses investigasi awal yang bertujuan untuk mengidentifikasi pola, menemukan anomali, menguji hipotesis, dan memeriksa asumsi.

1. Memeriksa sebaran Data

Sebaran untuk peubah numerik

# Histograms of the numeric columns, arranged on a 3 x 3 grid.
plot_histogram(
  data = data_insurance,
  geom_histogram_args = list(fill = "steelblue"),
  nrow = 3,
  ncol = 3
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Transformasi Response

# Overwrite the response with its natural log; everything below works on
# log(expenses). Not idempotent: running this line again logs the column twice.
data_insurance[["expenses"]] <- log(data_insurance[["expenses"]])

# Redraw the histograms to inspect the transformed response.
plot_histogram(
  data = data_insurance,
  geom_histogram_args = list(fill = "steelblue"),
  nrow = 3,
  ncol = 3
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Sebaran untuk peubah kategorik

# Bar charts of the categorical columns, arranged on a 3 x 3 grid.
plot_bar(
  data = data_insurance,
  nrow = 3,
  ncol = 3
)

2. Memeriksa Korelasi Peubah

# Scatterplots of each numeric predictor against the (log) response.
plot_scatterplot(
  data = data_insurance[, c("expenses", "age", "bmi", "children")],
  by = "expenses",
  geom_point_args = list(color = "steelblue")
)

Model Regresi Linear

# Ordinary least squares fit of log(expenses) on every other column.
regresi <- lm(expenses ~ ., data = data_insurance)
# Coefficient table and goodness-of-fit summary.
summary(regresi)
## 
## Call:
## lm(formula = expenses ~ ., data = data_insurance)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.07186 -0.19835 -0.04917  0.06598  2.16636 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      7.0305581  0.0723960  97.112  < 2e-16 ***
## age              0.0345816  0.0008721  39.655  < 2e-16 ***
## sexmale         -0.0754164  0.0244012  -3.091 0.002038 ** 
## bmi              0.0133748  0.0020960   6.381 2.42e-10 ***
## children         0.1018568  0.0100995  10.085  < 2e-16 ***
## smokeryes        1.5543228  0.0302795  51.333  < 2e-16 ***
## regionnorthwest -0.0637876  0.0349057  -1.827 0.067860 .  
## regionsoutheast -0.1571967  0.0350828  -4.481 8.08e-06 ***
## regionsouthwest -0.1289522  0.0350271  -3.681 0.000241 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4443 on 1329 degrees of freedom
## Multiple R-squared:  0.7679, Adjusted R-squared:  0.7666 
## F-statistic: 549.8 on 8 and 1329 DF,  p-value: < 2.2e-16

Plot model regresi

# Sorted coefficient plot; transform = "exp" displays exponentiated estimates.
plot_model(
  model = regresi,
  type = "est",
  transform = "exp",
  sort.est = TRUE
)

# One prediction plot per model term.
plot_model(regresi, type = "pred")
## $age

## 
## $sex

## 
## $bmi

## 
## $children

## 
## $smoker

## 
## $region

Model Checking

# Diagnostic plots used to check the model assumptions.
plot_model(model = regresi, type = "diag")
## [[1]]

## 
## [[2]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[3]]

## 
## [[4]]
## `geom_smooth()` using formula = 'y ~ x'

# Residuals of the full-data model, reused by the normality tests below.
res <- resid(regresi)
# Normality test (Shapiro-Wilk).
shapiro.test(x = res)
## 
##  Shapiro-Wilk normality test
## 
## data:  res
## W = 0.8373, p-value < 2.2e-16
# Jarque-Bera normality test on the residuals.
fBasics::jarqueberaTest(x = res)
## 
## Title:
##  Jarque - Bera Normalality Test
## 
## Test Results:
##   STATISTIC:
##     X-squared: 1673.7604
##   P VALUE:
##     Asymptotic p Value: < 2.2e-16 
## 
## Description:
##  Fri Feb 24 15:05:19 2023 by user: sarah
# ksnormTest() compares the sample with the standard normal N(0, 1) (it calls
# ks.test(x, "pnorm")), so the residuals are centred and scaled first; raw
# residuals would be rejected merely because their spread is not 1.
# NOTE(review): location and scale are estimated from the sample, so the
# reported p-value is approximate. Re-knit to refresh the printed output.
fBasics::ksnormTest(as.numeric(scale(res)))
## Warning in ks.test.default(x, "pnorm", alternative = "two.sided"): ties should
## not be present for the Kolmogorov-Smirnov test
## Warning in ks.test.default(x, "pnorm", alternative = "less"): ties should not be
## present for the Kolmogorov-Smirnov test
## Warning in ks.test.default(x, "pnorm", alternative = "greater"): ties should not
## be present for the Kolmogorov-Smirnov test
## 
## Title:
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## Test Results:
##   STATISTIC:
##     D: 0.2831
##   P VALUE:
##     Alternative Two-Sided: < 2.2e-16 
##     Alternative      Less: < 2.2e-16 
##     Alternative   Greater: < 2.2e-16 
## 
## Description:
##  Fri Feb 24 15:05:19 2023 by user: sarah
# Anderson-Darling normality test on the residuals, printed explicitly.
print(fBasics::adTest(x = res))
## 
## Title:
##  Anderson - Darling Normality Test
## 
## Test Results:
##   STATISTIC:
##     A: 74.8074
##   P VALUE:
##     < 2.2e-16 
## 
## Description:
##  Fri Feb 24 15:05:19 2023 by user: sarah

Uji Homogen Ragam

# Breusch-Pagan test for constant residual variance, non-studentized version.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
lmtest::bptest(expenses ~ ., data = data_insurance, studentize = FALSE)
## 
##  Breusch-Pagan test
## 
## data:  expenses ~ .
## BP = 243.98, df = 8, p-value < 2.2e-16

Prediksi Regresi Linear

Membagi data menjadi data training dan data testing

# Fix the RNG seed so the random split is reproducible.
set.seed(123)
# Hold out 20% of the rows for testing; the remaining 80% are used for fitting.
data_split <- initial_split(data_insurance, prop = 0.8)

train1 <- training(data_split)
test1 <- testing(data_split)

# Refit the same specification on the training rows only.
regresi2 <- lm(formula = expenses ~ ., data = train1)

Prediksi Data Testing

# Predict the held-out rows with the training-set model; the values are on
# the log(expenses) scale.
prediksi <- predict(object = regresi2, newdata = test1)
head(prediksi)
##       14       15       21       22       27       33 
## 9.347747 9.893421 9.580387 8.495743 9.490087 8.474506

Evaluasi Data Testing

# Predict the held-out rows once with the training-set model and preview the
# first values (on the log(expenses) scale). Repeating the identical
# assignment adds nothing, so it is executed a single time.
prediksi <- predict(regresi2, newdata = test1)
head(prediksi)
##       14       15       21       22       27       33 
## 9.347747 9.893421 9.580387 8.495743 9.490087 8.474506

Evaluasi Hasil Prediksi

RMSE

# Root mean squared error on the test set. Both vectors are log(expenses), so
# the result is in log units, not in the original currency.
mlr3measures::rmse(truth = test1$expenses, response = prediksi)
## [1] 0.4415859

MAPE

# Mean absolute percentage error on the test set, computed on log(expenses);
# it is a relative error of the logged values, not of the raw expenses.
mlr3measures::mape(truth = test1$expenses, response = prediksi)
## [1] 0.03175772

Spearman Correlation

# Spearman rank correlation between predicted and observed test-set values.
mlr3measures::srho(truth = test1$expenses, response = prediksi)
## [1] 0.9039972

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Template example: summary statistics of the built-in cars data set.
summary(object = cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.