# Lendo o conjunto de dados
# Load the insurance records from insurance.csv (path is relative to the
# current working directory) into a data frame.
dataset <- read.csv(file = "insurance.csv")
# Codificando atributos categóricos como valores numéricos
# Encode the categorical columns as numeric values:
#   sex:    female -> 1, male -> 2
#   smoker: no     -> 0, yes  -> 1
# Values outside the listed levels become NA.
#
# as.numeric() applied directly to a factor returns the internal level codes
# (1, 2, ...), not the labels, so `smoker` would end up as 1/2 instead of the
# intended 0/1. Going through as.character() converts the labels themselves.
dataset$sex <- as.numeric(as.character(factor(
  dataset$sex,
  levels = c("female", "male"),
  labels = c(1, 2)
)))
dataset$smoker <- as.numeric(as.character(factor(
  dataset$smoker,
  levels = c("no", "yes"),
  labels = c(0, 1)
)))
# Dividindo o conjunto de dados em treinamento e teste
# install.packages('caTools')
library(caTools)
# Build a logical mask over the rows from the `charges` column with
# SplitRatio = 0.8: rows flagged TRUE form the training set, the remaining
# rows form the test set.
split <- sample.split(dataset$charges, SplitRatio = 0.8)
training_set <- dataset[split, , drop = FALSE]
test_set <- dataset[!split, , drop = FALSE]
# Regressão linear simples no conjunto de treinamento
# Fit a simple linear regression of charges on age using only the training
# rows, then print the model summary (coefficients, residuals, R-squared).
regressor <- lm(charges ~ age, data = training_set)
summary(regressor)
##
## Call:
## lm(formula = charges ~ age, data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8231 -6836 -6110 5423 47598
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3026.6 1079.1 2.805 0.00513 **
## age 266.0 26.2 10.152 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11870 on 1068 degrees of freedom
## Multiple R-squared: 0.08801, Adjusted R-squared: 0.08716
## F-statistic: 103.1 on 1 and 1068 DF, p-value: < 2.2e-16
# Predict charges for the held-out test rows with the fitted model.
y_pred <- predict(object = regressor, newdata = test_set)
library(ggplot2)
# Plot the training data: observed charges against age as red points, with
# the model's predictions for the same rows drawn as a blue line.
# The data frame is passed once through `data =` and columns are referenced by
# bare name; ggplot2 discourages `df$col` inside aes().
ggplot(data = training_set, mapping = aes(x = age, y = charges)) +
  geom_point(colour = "red") +
  geom_line(
    mapping = aes(y = predict(regressor, newdata = training_set)),
    colour = "blue"
  ) +
  labs(title = "Custos x Idade", x = "Idade", y = "Custos")
