persinj <- read.csv('persinj.csv')
str(persinj)
## 'data.frame': 22036 obs. of 4 variables:
## $ amt : num 87.8 353.6 688.8 172.8 43.3 ...
## $ inj : int 1 1 1 1 1 6 1 1 1 1 ...
## $ legrep : int 0 0 0 0 0 0 0 0 0 0 ...
## $ op_time: num 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
head(persinj, 5)
library(ggplot2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ dplyr 1.1.0
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.4 ✔ forcats 1.0.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
persinj %>%
ggplot(aes(x = amt)) +
geom_histogram() +
xlim(c(0, 300000)) +
ylim(0, 2000)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 379 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 4 rows containing missing values (`geom_bar()`).
persinj %>%
mutate(amt = log(amt)) %>%
ggplot(aes(x = amt)) +
geom_histogram(bins=50) +
xlim(c(0, 20)) +
ylim(0, 2000)
## Warning: Removed 7 rows containing missing values (`geom_bar()`).
### explore the predictors
##### change inj to factor type
persinj$inj <- factor(persinj$inj)
summary(persinj)
## amt inj legrep op_time
## Min. : 10 1:15638 Min. :0.0000 Min. : 0.10
## 1st Qu.: 6297 2: 3376 1st Qu.:0.0000 1st Qu.:23.00
## Median : 13854 3: 1133 Median :1.0000 Median :45.90
## Mean : 38367 4: 189 Mean :0.6366 Mean :46.33
## 3rd Qu.: 35123 5: 188 3rd Qu.:1.0000 3rd Qu.:69.30
## Max. :4485797 6: 256 Max. :1.0000 Max. :99.10
## 9: 1256
table(persinj$legrep)
##
## 0 1
## 8008 14028
persinj %>%
ggplot(aes(x = op_time)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
### split the dataset to training and test data
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(1)
partition <- createDataPartition(persinj$amt, p=0.75, list=F)
trains <- persinj[partition, ]
tests <- persinj[-partition, ]
summary(trains$amt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10 6297 13855 38250 35123 2798362
summary(tests$amt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21 6298 13852 38719 35115 4485797
trains %>% ggplot(aes(x=amt)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
### set benchmart model
lm.model <- lm(log(amt) ~ inj + op_time * legrep, data=trains)
summary(lm.model)
##
## Call:
## lm(formula = log(amt) ~ inj + op_time * legrep, data = trains)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.6878 -0.5922 0.0636 0.6686 4.3130
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.4931390 0.0265320 282.419 < 2e-16 ***
## inj2 0.6093366 0.0236972 25.713 < 2e-16 ***
## inj3 0.8097411 0.0383287 21.126 < 2e-16 ***
## inj4 0.7603019 0.0913781 8.320 < 2e-16 ***
## inj5 0.4876492 0.0905641 5.385 7.36e-08 ***
## inj6 0.4023193 0.0774957 5.192 2.11e-07 ***
## inj9 -0.8341865 0.0365674 -22.812 < 2e-16 ***
## op_time 0.0363561 0.0005043 72.093 < 2e-16 ***
## legrep 0.9384924 0.0337097 27.840 < 2e-16 ***
## op_time:legrep -0.0105613 0.0006339 -16.661 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.07 on 16518 degrees of freedom
## Multiple R-squared: 0.466, Adjusted R-squared: 0.4657
## F-statistic: 1602 on 9 and 16518 DF, p-value: < 2.2e-16
# plot(lm.model)
RMSE(exp(predict(lm.model, newdata = trains)), trains$amt)
## [1] 79215.65
RMSE(exp(predict(lm.model, newdata = tests)), tests$amt)
## [1] 101478.7
glm.gaussian <- glm(amt ~ . + legrep:op_time, data = trains,
family=gaussian('log'))
summary(glm.gaussian)
##
## Call:
## glm(formula = amt ~ . + legrep:op_time, family = gaussian("log"),
## data = trains)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -408800 -17174 -4797 2918 2610493
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.567945 0.105417 81.277 < 2e-16 ***
## inj2 0.606859 0.027083 22.407 < 2e-16 ***
## inj3 0.771353 0.031888 24.189 < 2e-16 ***
## inj4 0.897270 0.051467 17.434 < 2e-16 ***
## inj5 1.477990 0.031896 46.338 < 2e-16 ***
## inj6 1.237127 0.045138 27.408 < 2e-16 ***
## inj9 -0.520156 0.122714 -4.239 2.26e-05 ***
## legrep -0.021037 0.124004 -0.170 0.8653
## op_time 0.027709 0.001314 21.095 < 2e-16 ***
## legrep:op_time 0.003417 0.001529 2.235 0.0254 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 5411311195)
##
## Null deviance: 1.2002e+14 on 16527 degrees of freedom
## Residual deviance: 8.9384e+13 on 16518 degrees of freedom
## AIC: 417338
##
## Number of Fisher Scoring iterations: 8
RMSE(predict(glm.gaussian, newdata=trains, type="response"), trains$amt)
## [1] 73539.42
RMSE(predict(glm.gaussian, newdata=tests, type="response"), tests$amt)
## [1] 96340.05
glm.gamma <- glm(amt ~. + legrep:op_time, data=trains, family = Gamma('log'))
summary(glm.gamma)
##
## Call:
## glm(formula = amt ~ . + legrep:op_time, family = Gamma("log"),
## data = trains)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5327 -0.9180 -0.3689 0.1823 8.2436
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.1915509 0.0321717 254.620 < 2e-16 ***
## inj2 0.6218494 0.0287343 21.641 < 2e-16 ***
## inj3 0.8391057 0.0464759 18.055 < 2e-16 ***
## inj4 1.0224844 0.1108016 9.228 < 2e-16 ***
## inj5 1.3016924 0.1098147 11.854 < 2e-16 ***
## inj6 0.7995407 0.0939684 8.509 < 2e-16 ***
## inj9 -0.6205220 0.0443403 -13.995 < 2e-16 ***
## legrep 0.4626597 0.0408752 11.319 < 2e-16 ***
## op_time 0.0346083 0.0006115 56.597 < 2e-16 ***
## legrep:op_time -0.0052821 0.0007686 -6.872 6.55e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 1.68352)
##
## Null deviance: 32845 on 16527 degrees of freedom
## Residual deviance: 16665 on 16518 degrees of freedom
## AIC: 365722
##
## Number of Fisher Scoring iterations: 8
RMSE(predict(glm.gamma, newdata=trains, type="response"), trains$amt)
## [1] 74082.73
RMSE(predict(glm.gamma, newdata=tests, type="response"), tests$amt)
## [1] 96849.87
# glm.inverse_gaussian <- glm(amt ~ inj + legrep + op_time, data=trains, family=inverse.gaussian(link = "log"))
#
#
# summary(glm.inverse_gaussian)
#
# RMSE(predict(glm.inverse_gaussian, newdata=trains, type="response"), trains$amt)
# RMSE(predict(glm.inverse_gaussian, newdata=tests, type="response"), tests$amt)
#