library(rsample)
library(DataExplorer)
library(sjPlot)
## Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(openxlsx)
Pertama-tama, kita harus mengimpor data yang akan dianalisis.
# Load readr and import the raw insurance CSV as a tibble.
# NOTE(review): the path is an absolute, machine-specific location, so this
# step only works where exactly this file exists.
library(readr)
insurance_1_ <- read_csv("C:/Users/sarah/Downloads/insurance (1).csv")
## Rows: 1338 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): sex, smoker, region
## dbl (4): age, bmi, children, expenses
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() opens a GUI data viewer, so it is only called in an interactive
# session; non-interactive runs (Rscript, knitting) simply skip it.
if (interactive()) {
  View(insurance_1_)
}
Kemudian kita membaca kembali data yang sama ke dalam objek bernama data_insurance, dengan kolom teks dikonversi menjadi faktor.
# Read the CSV again with base R; stringsAsFactors = TRUE stores the text
# columns as factors.
data_insurance <- utils::read.csv(
  file = "C:/Users/sarah/Downloads/insurance (1).csv",
  stringsAsFactors = TRUE
)
# Print the first six rows as a quick check of the import.
utils::head(x = data_insurance, n = 6L)
## age sex bmi children smoker region expenses
## 1 19 female 27.900 0 yes southwest 16884.924
## 2 18 male 33.770 1 no southeast 1725.552
## 3 28 male 33.000 3 no southeast 4449.462
## 4 33 male 22.705 0 no northwest 21984.471
## 5 32 male 28.880 0 no northwest 3866.855
## 6 31 female 25.740 0 no southeast 3756.622
Eksplorasi data adalah suatu proses investigasi awal yang bertujuan untuk mengidentifikasi pola, menemukan anomali, menguji hipotesis, dan memeriksa asumsi.
Sebaran untuk peubah numerik
# Histograms of the numeric columns, laid out on a 3 x 3 grid.
plot_histogram(
  data = data_insurance,
  geom_histogram_args = list(fill = "steelblue"),
  nrow = 3,
  ncol = 3
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Overwrite expenses with its natural logarithm. Any later code that reads
# data_insurance therefore sees log(expenses), not the raw amounts.
data_insurance[["expenses"]] <- log(data_insurance[["expenses"]])
# Same histograms again, now with the log-transformed expenses.
plot_histogram(
  data = data_insurance,
  geom_histogram_args = list(fill = "steelblue"),
  nrow = 3,
  ncol = 3
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Bar charts on the same 3 x 3 layout.
plot_bar(
  data = data_insurance,
  nrow = 3,
  ncol = 3
)
# Scatterplots of the selected numeric columns against expenses.
plot_scatterplot(
  data = data_insurance[c("expenses", "age", "bmi", "children")],
  by = "expenses",
  geom_point_args = list(color = "steelblue")
)
# Fit a linear model of expenses on every other column of data_insurance.
regresi <- lm(formula = expenses ~ ., data = data_insurance)
# Coefficient table and overall fit statistics.
summary(object = regresi)
##
## Call:
## lm(formula = expenses ~ ., data = data_insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.07186 -0.19835 -0.04917 0.06598 2.16636
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.0305581 0.0723960 97.112 < 2e-16 ***
## age 0.0345816 0.0008721 39.655 < 2e-16 ***
## sexmale -0.0754164 0.0244012 -3.091 0.002038 **
## bmi 0.0133748 0.0020960 6.381 2.42e-10 ***
## children 0.1018568 0.0100995 10.085 < 2e-16 ***
## smokeryes 1.5543228 0.0302795 51.333 < 2e-16 ***
## regionnorthwest -0.0637876 0.0349057 -1.827 0.067860 .
## regionsoutheast -0.1571967 0.0350828 -4.481 8.08e-06 ***
## regionsouthwest -0.1289522 0.0350271 -3.681 0.000241 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4443 on 1329 degrees of freedom
## Multiple R-squared: 0.7679, Adjusted R-squared: 0.7666
## F-statistic: 549.8 on 8 and 1329 DF, p-value: < 2.2e-16
# Plot of the coefficient estimates, sorted and shown after exp().
plot_model(
  model = regresi,
  type = "est",
  transform = "exp",
  sort.est = TRUE
)
# Prediction plots; the printed list has one entry per predictor.
plot_model(regresi, type = "pred")
## $age
##
## $sex
##
## $bmi
##
## $children
##
## $smoker
##
## $region
# Diagnostic plots for the fitted model (four plots are returned).
plot_model(model = regresi, type = "diag")
## [[1]]
##
## [[2]]
## `geom_smooth()` using formula = 'y ~ x'
##
## [[3]]
##
## [[4]]
## `geom_smooth()` using formula = 'y ~ x'
# Residuals of the fitted model; the normality tests below all use them.
res <- stats::residuals(regresi)
# Normality tests
# Shapiro-Wilk.
stats::shapiro.test(x = res)
##
## Shapiro-Wilk normality test
##
## data: res
## W = 0.8373, p-value < 2.2e-16
# Jarque-Bera.
fBasics::jarqueberaTest(x = res)
##
## Title:
## Jarque - Bera Normalality Test
##
## Test Results:
## STATISTIC:
## X-squared: 1673.7604
## P VALUE:
## Asymptotic p Value: < 2.2e-16
##
## Description:
## Fri Feb 24 15:05:19 2023 by user: sarah
# Kolmogorov-Smirnov normality test on the residuals.
# fBasics::ksnormTest() runs ks.test(x, "pnorm"), i.e. it compares its input
# with the standard normal distribution (mean 0, sd 1). The raw residuals have
# a standard error of about 0.44, so they would be rejected for their scale
# alone. They are therefore centred and scaled first, so that the test
# reflects the shape of the distribution only.
# NOTE(review): because the mean and sd are estimated from the same data, the
# reported p-values are approximate (no Lilliefors correction is applied).
fBasics::ksnormTest(as.numeric(scale(res)))
## (Output not shown: re-run this statement to obtain the test statistics for
## the standardised residuals.)
# Anderson-Darling normality test on the residuals; the result is printed
# explicitly.
print(fBasics::adTest(res))
##
## Title:
## Anderson - Darling Normality Test
##
## Test Results:
## STATISTIC:
## A: 74.8074
## P VALUE:
## < 2.2e-16
##
## Description:
## Fri Feb 24 15:05:19 2023 by user: sarah
# Breusch-Pagan test for heteroskedasticity on the same model formula.
# studentize = FALSE is spelled out in full: the shorthand `F` is an ordinary
# variable that can be reassigned, whereas FALSE is a reserved word.
lmtest::bptest(expenses ~ ., data = data_insurance, studentize = FALSE)
##
## Breusch-Pagan test
##
## data: expenses ~ .
## BP = 243.98, df = 8, p-value < 2.2e-16
Membagi data menjadi data training dan data testing
# Fix the random seed so the train/test split is reproducible.
set.seed(123)
# Put 80% of the rows into the training set; the rest form the test set.
data_split <- initial_split(data = data_insurance, prop = 0.8)
train1 <- training(data_split)
test1 <- testing(data_split)
# Refit the same model specification on the training rows only.
regresi2 <- lm(expenses ~ ., data = train1)
# Predict the held-out rows. The predictions are computed once; repeating the
# identical predict() call would only reproduce the same values.
prediksi <- predict(regresi2, newdata = test1)
head(prediksi)
## 14 15 21 22 27 33
## 9.347747 9.893421 9.580387 8.495743 9.490087 8.474506
RMSE
# Root mean squared error of the predictions on the test set.
# NOTE(review): the expenses column of data_insurance was replaced by its log
# earlier in this file, so this error is on the log scale.
mlr3measures::rmse(truth = test1[["expenses"]], response = prediksi)
## [1] 0.4415859
MAPE
# Mean absolute percentage error of the predictions on the test set.
# NOTE(review): the expenses column of data_insurance was replaced by its log
# earlier in this file, so this is the percentage error of log(expenses), not
# of expenses in their original units.
mlr3measures::mape(truth = test1[["expenses"]], response = prediksi)
## [1] 0.03175772
Spearman Correlation
# Spearman rank correlation between the test-set values and the predictions.
mlr3measures::srho(truth = test1[["expenses"]], response = prediksi)
## [1] 0.9039972
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Column-wise summary of the built-in cars data set (R Markdown template
# example).
summary(object = datasets::cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.