ACTEX Study Manual for Exam PA: Section 4.2 (Case Study 1: GLMs for Continuous Target Variables)

read the data

persinj <- read.csv('persinj.csv')

str(persinj)

## 'data.frame':    22036 obs. of  4 variables:
##  $ amt    : num  87.8 353.6 688.8 172.8 43.3 ...
##  $ inj    : int  1 1 1 1 1 6 1 1 1 1 ...
##  $ legrep : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ op_time: num  0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...

head(persinj, 5)

explore the target variable

library(ggplot2)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.4     ✔ forcats 1.0.0
## ✔ purrr   1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

persinj %>% 
  ggplot(aes(x = amt)) +
  geom_histogram() +
  xlim(c(0, 300000)) +
  ylim(0, 2000)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 379 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 4 rows containing missing values (`geom_bar()`).

persinj %>% 
  mutate(amt = log(amt)) %>% 
  ggplot(aes(x = amt)) +
  geom_histogram(bins=50) +
  xlim(c(0, 20)) +
  ylim(0, 2000)

## Warning: Removed 7 rows containing missing values (`geom_bar()`).

### explore the predictors
##### change inj to factor type

persinj$inj <- factor(persinj$inj)
summary(persinj)

##       amt          inj           legrep          op_time     
##  Min.   :     10   1:15638   Min.   :0.0000   Min.   : 0.10  
##  1st Qu.:   6297   2: 3376   1st Qu.:0.0000   1st Qu.:23.00  
##  Median :  13854   3: 1133   Median :1.0000   Median :45.90  
##  Mean   :  38367   4:  189   Mean   :0.6366   Mean   :46.33  
##  3rd Qu.:  35123   5:  188   3rd Qu.:1.0000   3rd Qu.:69.30  
##  Max.   :4485797   6:  256   Max.   :1.0000   Max.   :99.10  
##                    9: 1256

count the legrep

table(persinj$legrep)

## 
##     0     1 
##  8008 14028

examine the dist. of op_time

persinj %>% 
  ggplot(aes(x = op_time)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

### split the dataset to training and test data

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

set.seed(1)
partition <- createDataPartition(persinj$amt, p=0.75, list=F)

trains <- persinj[partition, ]
tests <- persinj[-partition, ]

summary(trains$amt)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      10    6297   13855   38250   35123 2798362

summary(tests$amt)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      21    6298   13852   38719   35115 4485797

trains %>% ggplot(aes(x=amt)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

### set benchmart model

lm.model <- lm(log(amt) ~ inj + op_time * legrep, data=trains)
summary(lm.model)

## 
## Call:
## lm(formula = log(amt) ~ inj + op_time * legrep, data = trains)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.6878 -0.5922  0.0636  0.6686  4.3130 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     7.4931390  0.0265320 282.419  < 2e-16 ***
## inj2            0.6093366  0.0236972  25.713  < 2e-16 ***
## inj3            0.8097411  0.0383287  21.126  < 2e-16 ***
## inj4            0.7603019  0.0913781   8.320  < 2e-16 ***
## inj5            0.4876492  0.0905641   5.385 7.36e-08 ***
## inj6            0.4023193  0.0774957   5.192 2.11e-07 ***
## inj9           -0.8341865  0.0365674 -22.812  < 2e-16 ***
## op_time         0.0363561  0.0005043  72.093  < 2e-16 ***
## legrep          0.9384924  0.0337097  27.840  < 2e-16 ***
## op_time:legrep -0.0105613  0.0006339 -16.661  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.07 on 16518 degrees of freedom
## Multiple R-squared:  0.466,  Adjusted R-squared:  0.4657 
## F-statistic:  1602 on 9 and 16518 DF,  p-value: < 2.2e-16

# plot(lm.model)

RMSE(exp(predict(lm.model, newdata = trains)), trains$amt)

## [1] 79215.65

RMSE(exp(predict(lm.model, newdata = tests)), tests$amt)

## [1] 101478.7

GLM model with Gaussian family and log link function

glm.gaussian <- glm(amt ~ . + legrep:op_time, data = trains,
                    family=gaussian('log'))
summary(glm.gaussian)

## 
## Call:
## glm(formula = amt ~ . + legrep:op_time, family = gaussian("log"), 
##     data = trains)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -408800   -17174    -4797     2918  2610493  
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     8.567945   0.105417  81.277  < 2e-16 ***
## inj2            0.606859   0.027083  22.407  < 2e-16 ***
## inj3            0.771353   0.031888  24.189  < 2e-16 ***
## inj4            0.897270   0.051467  17.434  < 2e-16 ***
## inj5            1.477990   0.031896  46.338  < 2e-16 ***
## inj6            1.237127   0.045138  27.408  < 2e-16 ***
## inj9           -0.520156   0.122714  -4.239 2.26e-05 ***
## legrep         -0.021037   0.124004  -0.170   0.8653    
## op_time         0.027709   0.001314  21.095  < 2e-16 ***
## legrep:op_time  0.003417   0.001529   2.235   0.0254 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 5411311195)
## 
##     Null deviance: 1.2002e+14  on 16527  degrees of freedom
## Residual deviance: 8.9384e+13  on 16518  degrees of freedom
## AIC: 417338
## 
## Number of Fisher Scoring iterations: 8

RMSE(predict(glm.gaussian, newdata=trains, type="response"), trains$amt)

## [1] 73539.42

RMSE(predict(glm.gaussian, newdata=tests, type="response"), tests$amt)

## [1] 96340.05

GLM model with Gamma family and log link function

glm.gamma <- glm(amt ~. + legrep:op_time, data=trains, family = Gamma('log'))

summary(glm.gamma)

## 
## Call:
## glm(formula = amt ~ . + legrep:op_time, family = Gamma("log"), 
##     data = trains)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.5327  -0.9180  -0.3689   0.1823   8.2436  
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     8.1915509  0.0321717 254.620  < 2e-16 ***
## inj2            0.6218494  0.0287343  21.641  < 2e-16 ***
## inj3            0.8391057  0.0464759  18.055  < 2e-16 ***
## inj4            1.0224844  0.1108016   9.228  < 2e-16 ***
## inj5            1.3016924  0.1098147  11.854  < 2e-16 ***
## inj6            0.7995407  0.0939684   8.509  < 2e-16 ***
## inj9           -0.6205220  0.0443403 -13.995  < 2e-16 ***
## legrep          0.4626597  0.0408752  11.319  < 2e-16 ***
## op_time         0.0346083  0.0006115  56.597  < 2e-16 ***
## legrep:op_time -0.0052821  0.0007686  -6.872 6.55e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 1.68352)
## 
##     Null deviance: 32845  on 16527  degrees of freedom
## Residual deviance: 16665  on 16518  degrees of freedom
## AIC: 365722
## 
## Number of Fisher Scoring iterations: 8

RMSE(predict(glm.gamma, newdata=trains, type="response"), trains$amt)

## [1] 74082.73

RMSE(predict(glm.gamma, newdata=tests, type="response"), tests$amt)

## [1] 96849.87

GLM model with Inverse gaussian family and log link function

# glm.inverse_gaussian <- glm(amt ~ inj + legrep + op_time, data=trains, family=inverse.gaussian(link = "log"))
# 
# 
# summary(glm.inverse_gaussian)
# 
# RMSE(predict(glm.inverse_gaussian, newdata=trains, type="response"), trains$amt)
# RMSE(predict(glm.inverse_gaussian, newdata=tests, type="response"), tests$amt)
#

ACTEX Study Manual for Exam PA: Section 4.2 (Case Study 1: GLMs for Continuous Target Variables)

yangj22

read the data

explore the target variable

count the legrep

examine the dist. of op_time

GLM model with Gaussian family and log link function

GLM model with Gamma family and log link function

GLM model with Inverse gaussian family and log link function