read the data

persinj <- read.csv('persinj.csv')

str(persinj)
## 'data.frame':    22036 obs. of  4 variables:
##  $ amt    : num  87.8 353.6 688.8 172.8 43.3 ...
##  $ inj    : int  1 1 1 1 1 6 1 1 1 1 ...
##  $ legrep : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ op_time: num  0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
head(persinj, 5)

explore the target variable

library(ggplot2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.4     ✔ forcats 1.0.0
## ✔ purrr   1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
persinj %>% 
  ggplot(aes(x = amt)) +
  geom_histogram() +
  xlim(c(0, 300000)) +
  ylim(0, 2000)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 379 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 4 rows containing missing values (`geom_bar()`).

persinj %>% 
  mutate(amt = log(amt)) %>% 
  ggplot(aes(x = amt)) +
  geom_histogram(bins=50) +
  xlim(c(0, 20)) +
  ylim(0, 2000)
## Warning: Removed 7 rows containing missing values (`geom_bar()`).

### explore the predictors
##### change inj to factor type

persinj$inj <- factor(persinj$inj)
summary(persinj)
##       amt          inj           legrep          op_time     
##  Min.   :     10   1:15638   Min.   :0.0000   Min.   : 0.10  
##  1st Qu.:   6297   2: 3376   1st Qu.:0.0000   1st Qu.:23.00  
##  Median :  13854   3: 1133   Median :1.0000   Median :45.90  
##  Mean   :  38367   4:  189   Mean   :0.6366   Mean   :46.33  
##  3rd Qu.:  35123   5:  188   3rd Qu.:1.0000   3rd Qu.:69.30  
##  Max.   :4485797   6:  256   Max.   :1.0000   Max.   :99.10  
##                    9: 1256
count the legrep
table(persinj$legrep)
## 
##     0     1 
##  8008 14028
examine the dist. of op_time
persinj %>% 
  ggplot(aes(x = op_time)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

### split the dataset to training and test data

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
set.seed(1)
partition <- createDataPartition(persinj$amt, p=0.75, list=F)

trains <- persinj[partition, ]
tests <- persinj[-partition, ]

summary(trains$amt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      10    6297   13855   38250   35123 2798362
summary(tests$amt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      21    6298   13852   38719   35115 4485797
trains %>% ggplot(aes(x=amt)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

### set benchmart model

lm.model <- lm(log(amt) ~ inj + op_time * legrep, data=trains)
summary(lm.model)
## 
## Call:
## lm(formula = log(amt) ~ inj + op_time * legrep, data = trains)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.6878 -0.5922  0.0636  0.6686  4.3130 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     7.4931390  0.0265320 282.419  < 2e-16 ***
## inj2            0.6093366  0.0236972  25.713  < 2e-16 ***
## inj3            0.8097411  0.0383287  21.126  < 2e-16 ***
## inj4            0.7603019  0.0913781   8.320  < 2e-16 ***
## inj5            0.4876492  0.0905641   5.385 7.36e-08 ***
## inj6            0.4023193  0.0774957   5.192 2.11e-07 ***
## inj9           -0.8341865  0.0365674 -22.812  < 2e-16 ***
## op_time         0.0363561  0.0005043  72.093  < 2e-16 ***
## legrep          0.9384924  0.0337097  27.840  < 2e-16 ***
## op_time:legrep -0.0105613  0.0006339 -16.661  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.07 on 16518 degrees of freedom
## Multiple R-squared:  0.466,  Adjusted R-squared:  0.4657 
## F-statistic:  1602 on 9 and 16518 DF,  p-value: < 2.2e-16
# plot(lm.model)

RMSE(exp(predict(lm.model, newdata = trains)), trains$amt)
## [1] 79215.65
RMSE(exp(predict(lm.model, newdata = tests)), tests$amt)
## [1] 101478.7