packages <- c("psych", # quick summary stats for data exploration,
"stargazer", #summary stats for sharing,
"tidyverse", # data manipulation like selecting variables,
"corrplot", # correlation plots
"ggplot2", # graphing
"data.table", # reshape for graphing
"car",
"MASS",
"caret",
"glmnet",
"Metrics"
)
for (i in 1:length(packages)) {
if (!packages[i] %in% rownames(installed.packages())) {
install.packages(packages[i]
, repos = "http://cran.rstudio.com/"
, dependencies = TRUE
)
}
library(packages[i], character.only = TRUE)
}
## Warning: 程辑包'psych'是用R版本4.3.1 来建造的
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%() masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## corrplot 0.92 loaded
##
##
## 载入程辑包:'data.table'
##
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
##
## The following object is masked from 'package:purrr':
##
## transpose
##
##
## 载入需要的程辑包:carData
##
##
## 载入程辑包:'car'
##
##
## The following object is masked from 'package:dplyr':
##
## recode
##
##
## The following object is masked from 'package:purrr':
##
## some
##
##
## The following object is masked from 'package:psych':
##
## logit
##
##
##
## 载入程辑包:'MASS'
##
##
## The following object is masked from 'package:dplyr':
##
## select
##
##
## 载入需要的程辑包:lattice
##
##
## 载入程辑包:'caret'
##
##
## The following object is masked from 'package:purrr':
##
## lift
##
##
## 将程序包安装入'C:/Users/80478/AppData/Local/R/win-library/4.3'
## (因为'lib'没有被指定)
##
## 还安装相依关系'lars'
## 程序包'lars'打开成功,MD5和检查也通过
## 程序包'glmnet'打开成功,MD5和检查也通过
##
## 下载的二进制程序包在
## C:\Users\80478\AppData\Local\Temp\Rtmp0eebQh\downloaded_packages里
## Warning: 程辑包'glmnet'是用R版本4.3.1 来建造的
## 载入需要的程辑包:Matrix
##
## 载入程辑包:'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-7
##
## 载入程辑包:'Metrics'
##
## The following objects are masked from 'package:caret':
##
## precision, recall
remove(list = ls())
setwd("D:/Study/Econometric/WorkDirectory")
Train <- read.csv("D:/Study/Econometric/WorkDirectory/insurance_training_data.csv")
Eval <- read.csv("D:/Study/Econometric/WorkDirectory/insurance-evaluation-data.csv")
mylabel1 <- c( 'Identification Variable',
'Was Car in a crash',
'If car was in a crash, what was the cost',
'# Driving Children',
'Age of Driver',
'# Children at Home',
'Years on Job',
'Income',
'Single Parent',
'Home Value',
'Marital Status',
'Gender',
'Max Education Level',
'Job Category',
'Distance to Work',
'Vehicle Use',
'Value of Vehicle ',
'Time in Force',
'Type of Car',
'A Red Car',
'Total Claims (Past 5 Years)',
'# Claims (Past 5 Years)',
'License Revoked (Past 7 Years)',
'Motor Vehicle Record Points',
'Vehicle Age',
'Home/Work Area'
)
mylabel2 <- c( '`Identification Variable`',
'`Was Car in a crash`',
'`If car was in a crash, what was the cost`',
'`# Driving Children`',
'`Age of Driver`',
'`# Children at Home`',
'`Years on Job`',
'Income',
'`Single Parent`',
'`Home Value`',
'`Marital Status`',
'Gender',
'`Max Education Level`',
'`Job Category`',
'`Distance to Work`',
'`Vehicle Use`',
'`Value of Vehicle `',
'`Time in Force`',
'`Type of Car`',
'`A Red Car`',
'`Total Claims (Past 5 Years)`',
'`# Claims (Past 5 Years)`',
'`License Revoked (Past 7 Years)`',
'`Motor Vehicle Record Points`',
'`Vehicle Age`',
'`Home/Work Area`')
colnames(Train) <- mylabel1
colnames(Eval) <- mylabel2
colSums(is.na(Train))
## Identification Variable
## 0
## Was Car in a crash
## 0
## If car was in a crash, what was the cost
## 0
## # Driving Children
## 0
## Age of Driver
## 6
## # Children at Home
## 0
## Years on Job
## 454
## Income
## 0
## Single Parent
## 0
## Home Value
## 0
## Marital Status
## 0
## Gender
## 0
## Max Education Level
## 0
## Job Category
## 0
## Distance to Work
## 0
## Vehicle Use
## 0
## Value of Vehicle
## 0
## Time in Force
## 0
## Type of Car
## 0
## A Red Car
## 0
## Total Claims (Past 5 Years)
## 0
## # Claims (Past 5 Years)
## 0
## License Revoked (Past 7 Years)
## 0
## Motor Vehicle Record Points
## 0
## Vehicle Age
## 510
## Home/Work Area
## 0
stargazer(Train,
type = "text"
)
##
## ====================================================================================
## Statistic N Mean St. Dev. Min Max
## ------------------------------------------------------------------------------------
## Identification Variable 8,161 5,151.868 2,978.894 1 10,302
## Was Car in a crash 8,161 0.264 0.441 0 1
## If car was in a crash, what was the cost 8,161 1,504.325 4,704.027 0.000 107,586.100
## # Driving Children 8,161 0.171 0.512 0 4
## Age of Driver 8,155 44.790 8.628 16 81
## # Children at Home 8,161 0.721 1.116 0 5
## Years on Job 7,707 10.499 4.092 0 23
## Distance to Work 8,161 33.486 15.908 5 142
## Time in Force 8,161 5.351 4.147 1 25
## # Claims (Past 5 Years) 8,161 0.799 1.158 0 5
## Motor Vehicle Record Points 8,161 1.696 2.147 0 13
## Vehicle Age 7,651 8.328 5.701 -3 28
## ------------------------------------------------------------------------------------
Trainm <- melt(Train)
## Warning in melt(Train): data.table 中的 melt 泛型函数被传递了 data.frame
## 即将转向到 reshape2 中的相关方法;请注意 reshape2
## 已经弃用,这个转向也已经弃用。如果要在 data.table 和 reshape2
## 同时附着的情况下继续使用 reshape2 中的 melt 方法,(例如
## melt.list),你可以把命名空间写在函数名称前面,例如
## reshape2::melt(Train)。在下一个版本中,此警告将变成为错误。
## Using Income, Single Parent, Home Value, Marital Status, Gender, Max Education Level, Job Category, Vehicle Use, Value of Vehicle , Type of Car, A Red Car, Total Claims (Past 5 Years), License Revoked (Past 7 Years), Home/Work Area as id variables
ggplot(data = Trainm, aes(x=value)) + geom_histogram() + facet_wrap(~variable, scale='free')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 970 rows containing non-finite values (`stat_bin()`).
Train[Train == ""] <- NA
Train$Income <- gsub("\\$", "", Train$Income)
Train$Income <- gsub(",", "", Train$Income)
Train$Income <- as.numeric(Train$Income)
Train$`Home Value` <- gsub("\\$", "", Train$`Home Value`)
Train$`Home Value` <- gsub(",", "", Train$`Home Value`)
Train$`Home Value` <- as.numeric(Train$`Home Value`)
Train$`Value of Vehicle ` <- gsub("\\$", "", Train$`Value of Vehicle `)
Train$`Value of Vehicle ` <- gsub(",", "", Train$`Value of Vehicle `)
Train$`Value of Vehicle ` <- as.numeric(Train$`Value of Vehicle `)
Train$`Total Claims (Past 5 Years)` <- gsub("\\$", "", Train$`Total Claims (Past 5 Years)`)
Train$`Total Claims (Past 5 Years)` <- gsub(",", "", Train$`Total Claims (Past 5 Years)`)
Train$`Total Claims (Past 5 Years)` <- as.numeric(Train$`Total Claims (Past 5 Years)`)
Train$Gender <- ifelse(Train$Gender == "M", 1, 0)
Train$Gender <- as.numeric(Train$Gender)
Train$`Marital Status` <- ifelse(Train$`Marital Status` == "Yes", 1, 0)
Train$`Marital Status` <- as.numeric(Train$`Marital Status`)
Train$`Single Parent` <- ifelse(Train$`Single Parent` == "Yes", 1, 0)
Train$`Single Parent` <- as.numeric(Train$`Single Parent`)
Train$`Vehicle Use` <- ifelse(Train$`Vehicle Use` == "Private", 1, 0)
Train$`Vehicle Use` <- as.numeric(Train$`Vehicle Use`)
Train$`A Red Car` <- ifelse(Train$`A Red Car` == "yes", 1, 0)
Train$`A Red Car` <- as.numeric(Train$`A Red Car`)
Train$`License Revoked (Past 7 Years)` <- ifelse(Train$`License Revoked (Past 7 Years)` == "Yes", 1, 0)
Train$`License Revoked (Past 7 Years)` <- as.numeric(Train$`License Revoked (Past 7 Years)`)
Train <- subset(Train, select = -c(`Job Category`, `Home/Work Area`))
colSums(is.na(Train))
## Identification Variable
## 0
## Was Car in a crash
## 0
## If car was in a crash, what was the cost
## 0
## # Driving Children
## 0
## Age of Driver
## 6
## # Children at Home
## 0
## Years on Job
## 454
## Income
## 445
## Single Parent
## 0
## Home Value
## 464
## Marital Status
## 0
## Gender
## 0
## Max Education Level
## 0
## Distance to Work
## 0
## Vehicle Use
## 0
## Value of Vehicle
## 0
## Time in Force
## 0
## Type of Car
## 0
## A Red Car
## 0
## Total Claims (Past 5 Years)
## 0
## # Claims (Past 5 Years)
## 0
## License Revoked (Past 7 Years)
## 0
## Motor Vehicle Record Points
## 0
## Vehicle Age
## 510
TrainNoNA <- na.omit(Train)
colSums(is.na(TrainNoNA))
## Identification Variable
## 0
## Was Car in a crash
## 0
## If car was in a crash, what was the cost
## 0
## # Driving Children
## 0
## Age of Driver
## 0
## # Children at Home
## 0
## Years on Job
## 0
## Income
## 0
## Single Parent
## 0
## Home Value
## 0
## Marital Status
## 0
## Gender
## 0
## Max Education Level
## 0
## Distance to Work
## 0
## Vehicle Use
## 0
## Value of Vehicle
## 0
## Time in Force
## 0
## Type of Car
## 0
## A Red Car
## 0
## Total Claims (Past 5 Years)
## 0
## # Claims (Past 5 Years)
## 0
## License Revoked (Past 7 Years)
## 0
## Motor Vehicle Record Points
## 0
## Vehicle Age
## 0
TrainNoNA$`Age of Driver` <- cut(TrainNoNA$`Age of Driver`, breaks = c(0, 18, 30, 40, 50, 60, Inf),
labels = c("0-18", "18-30", "30-40", "40-50", "50-60", ">60"))
f1 <- as.formula("`Max Education Level` ~ .")
f2 <- as.formula("`Type of Car` ~ .")
dummy1 <- dummyVars(f1, data = TrainNoNA)
TrainNoNA <- predict(dummy1, newdata = TrainNoNA)
TrainNoNA <- as.data.frame(TrainNoNA)
TrainC <- TrainNoNA[,-1]
min_value <- min(TrainC$`Was Car in a crash`, TrainC$Income, TrainC$`Home Value`)
constant <- min_value + 1
BC <- powerTransform(object = cbind(TrainC$`Was Car in a crash` + constant,
TrainC$Income + constant,
TrainC$`Home Value` + constant)
)
testTransform(object = BC, lambda = BC$lambda)
## LRT df pval
## LR test, lambda = (0.43) 0 1 1
CarCrash_transformed <- TrainC$`Was Car in a crash`^BC$lambda[1]
CarCrash_transformed
## numeric(0)
Income_transformed <- TrainC$Income^BC$lambda[1]
Homevalue_transformed <- TrainC$`Home Value`^BC$lambda[1]
train_corr <- subset(TrainNoNA, select = -c(TrainNoNA$`\`Identification Variable\``))
corrplot(cor(train_corr, use = "na.or.complete"),
type = "lower",
method = "square"
)
stargazer(TrainC,
type = "text"
)
##
## =========================================================================================
## Statistic N Mean St. Dev. Min Max
## -----------------------------------------------------------------------------------------
## `Was Car in a crash` 6,448 0.264 0.441 0 1
## `If car was in a crash, what was the cost` 6,448 1,497.018 4,665.921 0.000 85,523.650
## `# Driving Children` 6,448 0.169 0.509 0 4
## `Age of Driver`0-18 6,448 0.001 0.028 0 1
## `Age of Driver`18-30 6,448 0.051 0.220 0 1
## `Age of Driver`30-40 6,448 0.265 0.441 0 1
## `Age of Driver`40-50 6,448 0.423 0.494 0 1
## `Age of Driver`50-60 6,448 0.226 0.418 0 1
## `Age of Driver`> 60 6,448 0.034 0.181 0 1
## `# Children at Home` 6,448 0.725 1.122 0 5
## `Years on Job` 6,448 10.539 4.068 0 23
## Income 6,448 62,019.480 47,430.270 0 367,030
## `Single Parent` 6,448 0.133 0.340 0 1
## `Home Value` 6,448 155,438.200 129,655.300 0 885,282
## `Marital Status` 6,448 0.594 0.491 0 1
## Gender 6,448 0.465 0.499 0 1
## `Distance to Work` 6,448 33.662 15.845 5 142
## `Vehicle Use` 6,448 0.627 0.484 0 1
## `Value of Vehicle ` 6,448 15,760.730 8,434.251 1,500 69,740
## `Time in Force` 6,448 5.377 4.149 1 25
## `Type of Car`Minivan 6,448 0.266 0.442 0 1
## `Type of Car`Panel Truck 6,448 0.082 0.274 0 1
## `Type of Car`Pickup 6,448 0.172 0.378 0 1
## `Type of Car`Sports Car 6,448 0.112 0.315 0 1
## `Type of Car`Van 6,448 0.089 0.285 0 1
## `Type of Car`z_SUV 6,448 0.279 0.448 0 1
## `A Red Car` 6,448 0.292 0.455 0 1
## `Total Claims (Past 5 Years)` 6,448 4,046.429 8,847.249 0 57,037
## `# Claims (Past 5 Years)` 6,448 0.795 1.162 0 5
## `License Revoked (Past 7 Years)` 6,448 0.123 0.328 0 1
## `Motor Vehicle Record Points` 6,448 1.705 2.159 0 13
## `Vehicle Age` 6,448 8.296 5.715 -3 28
## -----------------------------------------------------------------------------------------
LinearM <- lm(data = TrainC,
TrainC$`\`Was Car in a crash\``~ .
)
LogiModel1 <- glm(formula = TrainC$`\`Was Car in a crash\`` ~ TrainC$`\`Distance to Work\`` + TrainC$`\`Vehicle Use\``,
data = TrainC,
family = binomial(link = "logit")
)
LogiModel2 <- glm(formula = TrainC$`\`Was Car in a crash\`` ~ TrainC$`\`Total Claims (Past 5 Years)\`` + TrainC$`\`License Revoked (Past 7 Years)\``,
data = TrainC,
family = binomial(link = "logit")
)
MatrixT <- as.matrix(TrainC)
X <- MatrixT[, -3]
Y <- TrainC$`\`Was Car in a crash\``
ridge_model <- glmnet(X, Y, alpha = 0)
lasso_model <- glmnet(X, Y, alpha = 1)
stargazer(LogiModel1, LogiModel2, LinearM,
type = 'text' )
##
## =============================================================================================
## Dependent variable:
## ------------------------------------------------
## ``Was Car in a crash``
## logistic OLS
## (1) (2) (3)
## ---------------------------------------------------------------------------------------------
## ``Distance to Work`` 0.007***
## (0.002)
##
## ``Vehicle Use`` -0.661***
## (0.058)
##
## ``Total Claims (Past 5 Years)`` 0.00002***
## (0.00000)
##
## ``License Revoked (Past 7 Years)`` 0.654***
## (0.088)
##
## ``If car was in a crash, what was the cost`` 0.00004***
## (0.00000)
##
## ``# Driving Children`` 0.045***
## (0.010)
##
## ``Age of Driver`0-18` -0.148
## (0.158)
##
## ``Age of Driver`18-30` 0.045
## (0.033)
##
## ``Age of Driver`30-40` -0.039
## (0.027)
##
## ``Age of Driver`40-50` -0.078***
## (0.026)
##
## ``Age of Driver`50-60` -0.047*
## (0.027)
##
## ``Age of Driver`> 60`
##
##
## ``# Children at Home`` -0.002
## (0.005)
##
## ``Years on Job`` -0.001
## (0.001)
##
## Income -0.00000
## (0.00000)
##
## ``Single Parent`` 0.057***
## (0.017)
##
## ``Home Value`` -0.00000***
## (0.00000)
##
## ``Marital Status`` -0.029**
## (0.013)
##
## Gender -0.003
## (0.016)
##
## ``Distance to Work`` 0.001***
## (0.0003)
##
## ``Vehicle Use`` -0.093***
## (0.011)
##
## ``Value of Vehicle `` -0.00000***
## (0.00000)
##
## ``Time in Force`` -0.005***
## (0.001)
##
## ``Type of Car`Minivan` -0.055***
## (0.016)
##
## ``Type of Car`Panel Truck` -0.014
## (0.028)
##
## ``Type of Car`Pickup` -0.013
## (0.017)
##
## ``Type of Car`Sports Car` 0.017
## (0.016)
##
## ``Type of Car`Van` 0.001
## (0.023)
##
## ``Type of Car`z_SUV`
##
##
## ``A Red Car`` -0.014
## (0.013)
##
## ``Total Claims (Past 5 Years)`` -0.00000***
## (0.00000)
##
## ``# Claims (Past 5 Years)`` 0.043***
## (0.005)
##
## ``License Revoked (Past 7 Years)`` 0.144***
## (0.015)
##
## ``Motor Vehicle Record Points`` 0.018***
## (0.002)
##
## ``Vehicle Age`` -0.002**
## (0.001)
##
## Constant -0.884*** -1.211*** 0.377***
## (0.075) (0.033) (0.037)
##
## ---------------------------------------------------------------------------------------------
## Observations 6,448 6,448 6,448
## R2 0.380
## Adjusted R2 0.377
## Log Likelihood -3,647.764 -3,638.634
## Akaike Inf. Crit. 7,301.528 7,283.269
## Residual Std. Error 0.348 (df = 6418)
## F Statistic 135.689*** (df = 29; 6418)
## =============================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
summary(ridge_model)
## Length Class Mode
## a0 100 -none- numeric
## beta 3100 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 4 -none- call
## nobs 1 -none- numeric
summary(lasso_model)
## Length Class Mode
## a0 39 -none- numeric
## beta 1209 dgCMatrix S4
## df 39 -none- numeric
## dim 2 -none- numeric
## lambda 39 -none- numeric
## dev.ratio 39 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 4 -none- call
## nobs 1 -none- numeric
AIC(LinearM)
## [1] 4715.215
plot(ridge_model, xvar = "lambda", label = TRUE,
title("Ridge Regression Coefficients"))
plot(lasso_model, xvar = "lambda", label = TRUE,
title("Lasso Regression Coefficients"))
## Warning in plotCoef(x$beta, lambda = x$lambda, df = x$df, dev = x$dev.ratio, :
## 1 or less nonzero coefficients; glmnet plot is not meaningful
plot(LinearM)
lp1<- predict(LogiModel1, TrainC, type = "response")
logistic_binary_pred1 <- ifelse(lp1 > 0.5, 1, 0)
logistic_mae1 <- mae(TrainC$`\`Was Car in a crash\``, logistic_binary_pred1)
r2forLoM1 <- cor(TrainC$`\`Was Car in a crash\``, lp1)^2
cat("Logistic MAE:", logistic_mae1, "R2:", r2forLoM1)
## Logistic MAE: 0.264268 R2: 0.02266993
lp2<- predict(LogiModel2, TrainC, type = "response")
logistic_binary_pred2 <- ifelse(lp2 > 0.5, 1, 0)
logistic_mae2 <- mae(TrainC$`\`Was Car in a crash\``, logistic_binary_pred2)
r2forLoM2 <- cor(TrainC$`\`Was Car in a crash\``, lp2)^2
cat("Logistic MAE:", logistic_mae2, "R2:", r2forLoM2)
## Logistic MAE: 0.2608561 R2: 0.02755773
```