I. Data Exploration

packages <- c("psych", # quick summary stats for data exploration,
              "stargazer", #summary stats for sharing,
              "tidyverse", # data manipulation like selecting variables,
              "corrplot", # correlation plots
              "ggplot2", # graphing
              "data.table", # reshape for graphing 
              "car",
              "MASS",
              "caret",
              "glmnet",
              "Metrics"
              )
 for (i in 1:length(packages)) {
    if (!packages[i] %in% rownames(installed.packages())) {
      install.packages(packages[i]
                       , repos = "http://cran.rstudio.com/"
                       , dependencies = TRUE
                       )
    }
    library(packages[i], character.only = TRUE)
  }

## Warning: 程辑包'psych'是用R版本4.3.1 来建造的

## 
## Please cite as:

##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## corrplot 0.92 loaded
## 
## 
## 载入程辑包：'data.table'
## 
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
## 
## 
## 载入需要的程辑包：carData
## 
## 
## 载入程辑包：'car'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## 
## The following object is masked from 'package:psych':
## 
##     logit
## 
## 
## 
## 载入程辑包：'MASS'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## 
## 载入需要的程辑包：lattice
## 
## 
## 载入程辑包：'caret'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     lift
## 
## 
## 将程序包安装入'C:/Users/80478/AppData/Local/R/win-library/4.3'
## (因为'lib'没有被指定)
## 
## 还安装相依关系'lars'

## 程序包'lars'打开成功，MD5和检查也通过
## 程序包'glmnet'打开成功，MD5和检查也通过
## 
## 下载的二进制程序包在
##  C:\Users\80478\AppData\Local\Temp\Rtmp0eebQh\downloaded_packages里

## Warning: 程辑包'glmnet'是用R版本4.3.1 来建造的

## 载入需要的程辑包：Matrix
## 
## 载入程辑包：'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-7
## 
## 载入程辑包：'Metrics'
## 
## The following objects are masked from 'package:caret':
## 
##     precision, recall

remove(list = ls())

setwd("D:/Study/Econometric/WorkDirectory")

Train <- read.csv("D:/Study/Econometric/WorkDirectory/insurance_training_data.csv")

Eval <- read.csv("D:/Study/Econometric/WorkDirectory/insurance-evaluation-data.csv")

mylabel1 <- c( 'Identification Variable', 
               'Was Car in a crash',
               'If car was in a crash, what was the cost',
               '# Driving Children',
               'Age of Driver',
               '# Children at Home',
               'Years on Job',
               'Income',
               'Single Parent',
               'Home Value',
               'Marital Status',
               'Gender',
               'Max Education Level',
               'Job Category',
               'Distance to Work',
               'Vehicle Use',
               'Value of Vehicle ',
               'Time in Force',
               'Type of Car',
               'A Red Car',
               'Total Claims (Past 5 Years)',
               '# Claims (Past 5 Years)',
               'License Revoked (Past 7 Years)',
               'Motor Vehicle Record Points',
               'Vehicle Age',
               'Home/Work Area'
               
  
)

mylabel2 <- c( '`Identification Variable`', 
               '`Was Car in a crash`',
               '`If car was in a crash, what was the cost`',
               '`# Driving Children`',
               '`Age of Driver`',
               '`# Children at Home`',
               '`Years on Job`',
               'Income',
               '`Single Parent`',
               '`Home Value`',
               '`Marital Status`',
               'Gender',
               '`Max Education Level`',
               '`Job Category`',
               '`Distance to Work`',
               '`Vehicle Use`',
               '`Value of Vehicle `',
               '`Time in Force`',
               '`Type of Car`',
               '`A Red Car`',
               '`Total Claims (Past 5 Years)`',
               '`# Claims (Past 5 Years)`',
               '`License Revoked (Past 7 Years)`',
               '`Motor Vehicle Record Points`',
               '`Vehicle Age`',
               '`Home/Work Area`')
               
  

colnames(Train) <- mylabel1

colnames(Eval) <- mylabel2

colSums(is.na(Train))

##                  Identification Variable 
##                                        0 
##                       Was Car in a crash 
##                                        0 
## If car was in a crash, what was the cost 
##                                        0 
##                       # Driving Children 
##                                        0 
##                            Age of Driver 
##                                        6 
##                       # Children at Home 
##                                        0 
##                             Years on Job 
##                                      454 
##                                   Income 
##                                        0 
##                            Single Parent 
##                                        0 
##                               Home Value 
##                                        0 
##                           Marital Status 
##                                        0 
##                                   Gender 
##                                        0 
##                      Max Education Level 
##                                        0 
##                             Job Category 
##                                        0 
##                         Distance to Work 
##                                        0 
##                              Vehicle Use 
##                                        0 
##                        Value of Vehicle  
##                                        0 
##                            Time in Force 
##                                        0 
##                              Type of Car 
##                                        0 
##                                A Red Car 
##                                        0 
##              Total Claims (Past 5 Years) 
##                                        0 
##                  # Claims (Past 5 Years) 
##                                        0 
##           License Revoked (Past 7 Years) 
##                                        0 
##              Motor Vehicle Record Points 
##                                        0 
##                              Vehicle Age 
##                                      510 
##                           Home/Work Area 
##                                        0

stargazer(Train,
          type = "text"
         )

## 
## ====================================================================================
## Statistic                                  N     Mean    St. Dev.   Min      Max    
## ------------------------------------------------------------------------------------
## Identification Variable                  8,161 5,151.868 2,978.894   1     10,302   
## Was Car in a crash                       8,161   0.264     0.441     0        1     
## If car was in a crash, what was the cost 8,161 1,504.325 4,704.027 0.000 107,586.100
## # Driving Children                       8,161   0.171     0.512     0        4     
## Age of Driver                            8,155  44.790     8.628    16       81     
## # Children at Home                       8,161   0.721     1.116     0        5     
## Years on Job                             7,707  10.499     4.092     0       23     
## Distance to Work                         8,161  33.486    15.908     5       142    
## Time in Force                            8,161   5.351     4.147     1       25     
## # Claims (Past 5 Years)                  8,161   0.799     1.158     0        5     
## Motor Vehicle Record Points              8,161   1.696     2.147     0       13     
## Vehicle Age                              7,651   8.328     5.701    -3       28     
## ------------------------------------------------------------------------------------

Trainm <- melt(Train)

## Warning in melt(Train): data.table 中的 melt 泛型函数被传递了 data.frame
## 即将转向到 reshape2 中的相关方法；请注意 reshape2
## 已经弃用，这个转向也已经弃用。如果要在 data.table 和 reshape2
## 同时附着的情况下继续使用 reshape2 中的 melt 方法，（例如
## melt.list），你可以把命名空间写在函数名称前面，例如
## reshape2::melt(Train)。在下一个版本中，此警告将变成为错误。

## Using Income, Single Parent, Home Value, Marital Status, Gender, Max Education Level, Job Category, Vehicle Use, Value of Vehicle , Type of Car, A Red Car, Total Claims (Past 5 Years), License Revoked (Past 7 Years), Home/Work Area as id variables

ggplot(data = Trainm, aes(x=value)) + geom_histogram() + facet_wrap(~variable, scale='free')

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 970 rows containing non-finite values (`stat_bin()`).

In this summary chart I can see there are a lot of variables missing becuase they were treated as character variables. Due to those characteristic variables, I cannot creat a correlation plot, so in the next section I will fix that.Also, I think the home and work area is somewhat similar with the distance to work variables, so I will drop it.

II. Data Prep

Train[Train == ""] <- NA

Train$Income <- gsub("\\$", "", Train$Income)
Train$Income <- gsub(",", "", Train$Income)
Train$Income <- as.numeric(Train$Income)

Train$`Home Value` <- gsub("\\$", "", Train$`Home Value`)
Train$`Home Value` <- gsub(",", "", Train$`Home Value`)
Train$`Home Value` <- as.numeric(Train$`Home Value`)

Train$`Value of Vehicle ` <- gsub("\\$", "", Train$`Value of Vehicle `)
Train$`Value of Vehicle ` <- gsub(",", "", Train$`Value of Vehicle `)
Train$`Value of Vehicle ` <- as.numeric(Train$`Value of Vehicle `)

Train$`Total Claims (Past 5 Years)` <- gsub("\\$", "", Train$`Total Claims (Past 5 Years)`)
Train$`Total Claims (Past 5 Years)` <- gsub(",", "", Train$`Total Claims (Past 5 Years)`)
Train$`Total Claims (Past 5 Years)` <- as.numeric(Train$`Total Claims (Past 5 Years)`)

Train$Gender <- ifelse(Train$Gender == "M", 1, 0)
Train$Gender <- as.numeric(Train$Gender)

Train$`Marital Status` <- ifelse(Train$`Marital Status` == "Yes", 1, 0)
Train$`Marital Status` <- as.numeric(Train$`Marital Status`)

Train$`Single Parent` <- ifelse(Train$`Single Parent` == "Yes", 1, 0)
Train$`Single Parent` <- as.numeric(Train$`Single Parent`)

Train$`Vehicle Use` <- ifelse(Train$`Vehicle Use` == "Private", 1, 0)
Train$`Vehicle Use` <- as.numeric(Train$`Vehicle Use`)

Train$`A Red Car` <- ifelse(Train$`A Red Car` == "yes", 1, 0)
Train$`A Red Car` <- as.numeric(Train$`A Red Car`)

Train$`License Revoked (Past 7 Years)` <- ifelse(Train$`License Revoked (Past 7 Years)` == "Yes", 1, 0)
Train$`License Revoked (Past 7 Years)` <- as.numeric(Train$`License Revoked (Past 7 Years)`)

Train <- subset(Train, select = -c(`Job Category`, `Home/Work Area`))

colSums(is.na(Train))

##                  Identification Variable 
##                                        0 
##                       Was Car in a crash 
##                                        0 
## If car was in a crash, what was the cost 
##                                        0 
##                       # Driving Children 
##                                        0 
##                            Age of Driver 
##                                        6 
##                       # Children at Home 
##                                        0 
##                             Years on Job 
##                                      454 
##                                   Income 
##                                      445 
##                            Single Parent 
##                                        0 
##                               Home Value 
##                                      464 
##                           Marital Status 
##                                        0 
##                                   Gender 
##                                        0 
##                      Max Education Level 
##                                        0 
##                         Distance to Work 
##                                        0 
##                              Vehicle Use 
##                                        0 
##                        Value of Vehicle  
##                                        0 
##                            Time in Force 
##                                        0 
##                              Type of Car 
##                                        0 
##                                A Red Car 
##                                        0 
##              Total Claims (Past 5 Years) 
##                                        0 
##                  # Claims (Past 5 Years) 
##                                        0 
##           License Revoked (Past 7 Years) 
##                                        0 
##              Motor Vehicle Record Points 
##                                        0 
##                              Vehicle Age 
##                                      510

TrainNoNA <- na.omit(Train)

colSums(is.na(TrainNoNA))

##                  Identification Variable 
##                                        0 
##                       Was Car in a crash 
##                                        0 
## If car was in a crash, what was the cost 
##                                        0 
##                       # Driving Children 
##                                        0 
##                            Age of Driver 
##                                        0 
##                       # Children at Home 
##                                        0 
##                             Years on Job 
##                                        0 
##                                   Income 
##                                        0 
##                            Single Parent 
##                                        0 
##                               Home Value 
##                                        0 
##                           Marital Status 
##                                        0 
##                                   Gender 
##                                        0 
##                      Max Education Level 
##                                        0 
##                         Distance to Work 
##                                        0 
##                              Vehicle Use 
##                                        0 
##                        Value of Vehicle  
##                                        0 
##                            Time in Force 
##                                        0 
##                              Type of Car 
##                                        0 
##                                A Red Car 
##                                        0 
##              Total Claims (Past 5 Years) 
##                                        0 
##                  # Claims (Past 5 Years) 
##                                        0 
##           License Revoked (Past 7 Years) 
##                                        0 
##              Motor Vehicle Record Points 
##                                        0 
##                              Vehicle Age 
##                                        0

TrainNoNA$`Age of Driver` <- cut(TrainNoNA$`Age of Driver`, breaks = c(0, 18, 30, 40, 50, 60, Inf),
 
                       
                                             labels = c("0-18", "18-30", "30-40", "40-50", "50-60", ">60"))

f1 <- as.formula("`Max Education Level` ~ .")
f2 <- as.formula("`Type of Car` ~ .")

dummy1 <- dummyVars(f1, data = TrainNoNA)
TrainNoNA <- predict(dummy1, newdata = TrainNoNA)
TrainNoNA <- as.data.frame(TrainNoNA)
TrainC <- TrainNoNA[,-1]

min_value <- min(TrainC$`Was Car in a crash`, TrainC$Income, TrainC$`Home Value`)
constant <- min_value + 1

BC <- powerTransform(object = cbind(TrainC$`Was Car in a crash` + constant,
                                     TrainC$Income + constant,
                                     TrainC$`Home Value` + constant)
                     )

testTransform(object = BC, lambda = BC$lambda)

##                          LRT df pval
## LR test, lambda = (0.43)   0  1    1

CarCrash_transformed  <- TrainC$`Was Car in a crash`^BC$lambda[1]
CarCrash_transformed

## numeric(0)

Income_transformed  <- TrainC$Income^BC$lambda[1]
Homevalue_transformed  <- TrainC$`Home Value`^BC$lambda[1]

train_corr <- subset(TrainNoNA, select = -c(TrainNoNA$`\`Identification Variable\``))

corrplot(cor(train_corr, use = "na.or.complete"), 
         type = "lower", 
         method = "square"
         )

stargazer(TrainC,
          type = "text"
         )

## 
## =========================================================================================
## Statistic                                    N      Mean      St. Dev.    Min     Max    
## -----------------------------------------------------------------------------------------
## `Was Car in a crash`                       6,448    0.264       0.441      0       1     
## `If car was in a crash, what was the cost` 6,448  1,497.018   4,665.921  0.000 85,523.650
## `# Driving Children`                       6,448    0.169       0.509      0       4     
## `Age of Driver`0-18                        6,448    0.001       0.028      0       1     
## `Age of Driver`18-30                       6,448    0.051       0.220      0       1     
## `Age of Driver`30-40                       6,448    0.265       0.441      0       1     
## `Age of Driver`40-50                       6,448    0.423       0.494      0       1     
## `Age of Driver`50-60                       6,448    0.226       0.418      0       1     
## `Age of Driver`> 60                        6,448    0.034       0.181      0       1     
## `# Children at Home`                       6,448    0.725       1.122      0       5     
## `Years on Job`                             6,448   10.539       4.068      0       23    
## Income                                     6,448 62,019.480  47,430.270    0    367,030  
## `Single Parent`                            6,448    0.133       0.340      0       1     
## `Home Value`                               6,448 155,438.200 129,655.300   0    885,282  
## `Marital Status`                           6,448    0.594       0.491      0       1     
## Gender                                     6,448    0.465       0.499      0       1     
## `Distance to Work`                         6,448   33.662      15.845      5      142    
## `Vehicle Use`                              6,448    0.627       0.484      0       1     
## `Value of Vehicle `                        6,448 15,760.730   8,434.251  1,500   69,740  
## `Time in Force`                            6,448    5.377       4.149      1       25    
## `Type of Car`Minivan                       6,448    0.266       0.442      0       1     
## `Type of Car`Panel Truck                   6,448    0.082       0.274      0       1     
## `Type of Car`Pickup                        6,448    0.172       0.378      0       1     
## `Type of Car`Sports Car                    6,448    0.112       0.315      0       1     
## `Type of Car`Van                           6,448    0.089       0.285      0       1     
## `Type of Car`z_SUV                         6,448    0.279       0.448      0       1     
## `A Red Car`                                6,448    0.292       0.455      0       1     
## `Total Claims (Past 5 Years)`              6,448  4,046.429   8,847.249    0     57,037  
## `# Claims (Past 5 Years)`                  6,448    0.795       1.162      0       5     
## `License Revoked (Past 7 Years)`           6,448    0.123       0.328      0       1     
## `Motor Vehicle Record Points`              6,448    1.705       2.159      0       13    
## `Vehicle Age`                              6,448    8.296       5.715     -3       28    
## -----------------------------------------------------------------------------------------

The mean income is $62,019 with a standard deviation of $47,430. The proportion of cars that were in a crash is 0.264, and the average cost of a crash is $1,497. The majority of drivers are married (59.4%) and have at least one child (72.5%). I use bucketing for age variable so that I can interact with it easier. Meanwhile I split the type of cars to make it clearer. Because education level has more than 3 characters so I concluded them into dummy variable for better vision. In the corrlation graph we can see there is a strong correlatrion between driver age between 50-60 and driver age between 40-50. Because home value and car value are not normally distributed, I use box cox to transform them for convenience.

III. Modeling

LinearM <- lm(data = TrainC,
             TrainC$`\`Was Car in a crash\``~ .
             )

LogiModel1 <- glm(formula = TrainC$`\`Was Car in a crash\`` ~ TrainC$`\`Distance to Work\`` + TrainC$`\`Vehicle Use\``, 
                   data    = TrainC, 
                 family = binomial(link = "logit")
             )

LogiModel2 <- glm(formula = TrainC$`\`Was Car in a crash\`` ~ TrainC$`\`Total Claims (Past 5 Years)\`` + TrainC$`\`License Revoked (Past 7 Years)\``, 
                   data    = TrainC, 
                 family = binomial(link = "logit")
             )

MatrixT <- as.matrix(TrainC)

X <- MatrixT[, -3]
Y <- TrainC$`\`Was Car in a crash\``

ridge_model <- glmnet(X, Y, alpha = 0)

lasso_model <- glmnet(X, Y, alpha = 1)

stargazer(LogiModel1, LogiModel2, LinearM,
          type = 'text' )

## 
## =============================================================================================
##                                                            Dependent variable:               
##                                              ------------------------------------------------
##                                                           ``Was Car in a crash``             
##                                                    logistic                   OLS            
##                                                 (1)        (2)                (3)            
## ---------------------------------------------------------------------------------------------
## ``Distance to Work``                          0.007***                                       
##                                               (0.002)                                        
##                                                                                              
## ``Vehicle Use``                              -0.661***                                       
##                                               (0.058)                                        
##                                                                                              
## ``Total Claims (Past 5 Years)``                         0.00002***                           
##                                                         (0.00000)                            
##                                                                                              
## ``License Revoked (Past 7 Years)``                       0.654***                            
##                                                          (0.088)                             
##                                                                                              
## ``If car was in a crash, what was the cost``                               0.00004***        
##                                                                            (0.00000)         
##                                                                                              
## ``# Driving Children``                                                      0.045***         
##                                                                             (0.010)          
##                                                                                              
## ``Age of Driver`0-18`                                                        -0.148          
##                                                                             (0.158)          
##                                                                                              
## ``Age of Driver`18-30`                                                       0.045           
##                                                                             (0.033)          
##                                                                                              
## ``Age of Driver`30-40`                                                       -0.039          
##                                                                             (0.027)          
##                                                                                              
## ``Age of Driver`40-50`                                                     -0.078***         
##                                                                             (0.026)          
##                                                                                              
## ``Age of Driver`50-60`                                                      -0.047*          
##                                                                             (0.027)          
##                                                                                              
## ``Age of Driver`> 60`                                                                        
##                                                                                              
##                                                                                              
## ``# Children at Home``                                                       -0.002          
##                                                                             (0.005)          
##                                                                                              
## ``Years on Job``                                                             -0.001          
##                                                                             (0.001)          
##                                                                                              
## Income                                                                      -0.00000         
##                                                                            (0.00000)         
##                                                                                              
## ``Single Parent``                                                           0.057***         
##                                                                             (0.017)          
##                                                                                              
## ``Home Value``                                                            -0.00000***        
##                                                                            (0.00000)         
##                                                                                              
## ``Marital Status``                                                          -0.029**         
##                                                                             (0.013)          
##                                                                                              
## Gender                                                                       -0.003          
##                                                                             (0.016)          
##                                                                                              
## ``Distance to Work``                                                        0.001***         
##                                                                             (0.0003)         
##                                                                                              
## ``Vehicle Use``                                                            -0.093***         
##                                                                             (0.011)          
##                                                                                              
## ``Value of Vehicle ``                                                     -0.00000***        
##                                                                            (0.00000)         
##                                                                                              
## ``Time in Force``                                                          -0.005***         
##                                                                             (0.001)          
##                                                                                              
## ``Type of Car`Minivan`                                                     -0.055***         
##                                                                             (0.016)          
##                                                                                              
## ``Type of Car`Panel Truck`                                                   -0.014          
##                                                                             (0.028)          
##                                                                                              
## ``Type of Car`Pickup`                                                        -0.013          
##                                                                             (0.017)          
##                                                                                              
## ``Type of Car`Sports Car`                                                    0.017           
##                                                                             (0.016)          
##                                                                                              
## ``Type of Car`Van`                                                           0.001           
##                                                                             (0.023)          
##                                                                                              
## ``Type of Car`z_SUV`                                                                         
##                                                                                              
##                                                                                              
## ``A Red Car``                                                                -0.014          
##                                                                             (0.013)          
##                                                                                              
## ``Total Claims (Past 5 Years)``                                           -0.00000***        
##                                                                            (0.00000)         
##                                                                                              
## ``# Claims (Past 5 Years)``                                                 0.043***         
##                                                                             (0.005)          
##                                                                                              
## ``License Revoked (Past 7 Years)``                                          0.144***         
##                                                                             (0.015)          
##                                                                                              
## ``Motor Vehicle Record Points``                                             0.018***         
##                                                                             (0.002)          
##                                                                                              
## ``Vehicle Age``                                                             -0.002**         
##                                                                             (0.001)          
##                                                                                              
## Constant                                     -0.884***  -1.211***           0.377***         
##                                               (0.075)    (0.033)            (0.037)          
##                                                                                              
## ---------------------------------------------------------------------------------------------
## Observations                                   6,448      6,448              6,448           
## R2                                                                           0.380           
## Adjusted R2                                                                  0.377           
## Log Likelihood                               -3,647.764 -3,638.634                           
## Akaike Inf. Crit.                            7,301.528  7,283.269                            
## Residual Std. Error                                                    0.348 (df = 6418)     
## F Statistic                                                        135.689*** (df = 29; 6418)
## =============================================================================================
## Note:                                                             *p<0.1; **p<0.05; ***p<0.01

summary(ridge_model)

##           Length Class     Mode   
## a0         100   -none-    numeric
## beta      3100   dgCMatrix S4     
## df         100   -none-    numeric
## dim          2   -none-    numeric
## lambda     100   -none-    numeric
## dev.ratio  100   -none-    numeric
## nulldev      1   -none-    numeric
## npasses      1   -none-    numeric
## jerr         1   -none-    numeric
## offset       1   -none-    logical
## call         4   -none-    call   
## nobs         1   -none-    numeric

summary(lasso_model)

##           Length Class     Mode   
## a0          39   -none-    numeric
## beta      1209   dgCMatrix S4     
## df          39   -none-    numeric
## dim          2   -none-    numeric
## lambda      39   -none-    numeric
## dev.ratio   39   -none-    numeric
## nulldev      1   -none-    numeric
## npasses      1   -none-    numeric
## jerr         1   -none-    numeric
## offset       1   -none-    logical
## call         4   -none-    call   
## nobs         1   -none-    numeric

In this section, I did one OLS for all the variable except the car crash which is my dependent variables. I chose distance to work and car use for my first logistic regression because I think those varible should have a great impact on the chance of crashing a car. For my second logistic regression I chose the number of claims and time licence got revoked. This is because those shows the drivers’ history and can let us have some insight on their driving behavior. In the summary, we can see the number of driving children, number of claims, license revoked and single parent is relative with the chance of crashing a car. The license Revoked in the past 7 years stands out the most that it when a driver has record of license got revoked, the change of crashing a car increase by 14.4%.

IV. Model Selection

AIC(LinearM)

## [1] 4715.215

plot(ridge_model, xvar = "lambda", label = TRUE,
title("Ridge Regression Coefficients"))

plot(lasso_model, xvar = "lambda", label = TRUE,
title("Lasso Regression Coefficients"))

## Warning in plotCoef(x$beta, lambda = x$lambda, df = x$df, dev = x$dev.ratio, :
## 1 or less nonzero coefficients; glmnet plot is not meaningful

plot(LinearM)

lp1<- predict(LogiModel1, TrainC, type = "response")

logistic_binary_pred1 <- ifelse(lp1 > 0.5, 1, 0)

logistic_mae1 <- mae(TrainC$`\`Was Car in a crash\``, logistic_binary_pred1)

r2forLoM1 <- cor(TrainC$`\`Was Car in a crash\``, lp1)^2

cat("Logistic MAE:", logistic_mae1, "R2:", r2forLoM1)

## Logistic MAE: 0.264268 R2: 0.02266993

lp2<- predict(LogiModel2, TrainC, type = "response")

logistic_binary_pred2 <- ifelse(lp2 > 0.5, 1, 0)

logistic_mae2 <- mae(TrainC$`\`Was Car in a crash\``, logistic_binary_pred2)

r2forLoM2 <- cor(TrainC$`\`Was Car in a crash\``, lp2)^2

cat("Logistic MAE:", logistic_mae2, "R2:", r2forLoM2)

## Logistic MAE: 0.2608561 R2: 0.02755773

In my model selection, I will use the OLS model because it has much lower AIC, higer R2 and higher MSE than both of my logistic regression. Although the residuals vs Fitted and Scale-Location graph is not good and violate the Gauss-Markov Assumption, but that is because I’m actually testing an binary variables which is not always the situation. Also, the linear model shows the whole picture other just focusing on some specific variables.

```

Insurance

Yixuan Huang

2023-04-25

I. Data Exploration

II. Data Prep

III. Modeling

IV. Model Selection