Esame 18 novembre 2002

Lettura dei dati e considerazioni preliminari

source("/media/sally/STORE N GO/data mining/Lab 2013-2014/funzioniVarie.R")

ass <- read.table("/media/sally/STORE N GO/data mining/Esami/T68-1.DAT", header = T)
colnames(ass) <- c("Km.anno", "Zona", "Categoria", "Auto.tipo", "Ntot", "Nsinistri" , "Costo", "V8", "V9" ) 

str(ass)

## 'data.frame':    2182 obs. of  9 variables:
##  $ Km.anno  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Zona     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Categoria: int  1 1 1 1 1 1 1 1 1 2 ...
##  $ Auto.tipo: int  1 2 3 4 5 6 7 8 9 1 ...
##  $ Ntot     : num  455.1 69.2 72.9 1292.4 191 ...
##  $ Nsinistri: int  108 19 13 124 40 57 23 14 1704 45 ...
##  $ Costo    : int  392491 46221 15694 422201 119373 170913 56940 77487 6805992 214011 ...
##  $ V8       : num  5.07e+09 2.01e+08 2.53e+07 6.80e+09 1.36e+09 ...
##  $ V9       : num  862 668 215 327 625 ...

#--------------------#
# Codifica variabili 
#--------------------#

# Km.anno è una discretizzazione di qualcosa di continuo.. lascio così

ass$Zona <- factor(ass$Zona)
# Rinomino i livelli
levels(ass$Zona) <- c("metro", "metro", "città", "città", "rurale", "rurale", "rurale")

ass$Auto.tipo <- factor(ass$Auto.tipo)
# ass$Categoria <- factor(ass$Categoria, ordered = T)

# Check per NA dim(ass[is.na(ass), ])[1]

# Creazione variabile risposta : Costo medio
ass$costo.medio <- ass$Costo/ass$Nsinistri
ass$costo.medio[ass$Nsinistri == 0] <- 0

#==================================================#
# Dovrebbe aver senso la trasformata logaritmica (tipo esempio visto a lezione)
ass$costo.medio.log<- pmax(log(ass$costo.medio),0)
# e quindi anche fare due modelli, prima logistico e poi il resto (no voglia)
#==================================================#

# Sicuramente non mi serve il costo totale, V8/V9 non so che cosa sono
# Potrei pensare di metterle nel modello e vedere che succede, ma poi tanto non saprei interpretarle, quindi via
# il N. di sinistri lo uso per costruire la risposta 

# names(ass)[-c(6:9)]
ass <- ass[, -c(6:9)]

Stima Verifica

perc = 0.7
set.seed(42)
indici <- sample(1:nrow(ass), round(nrow(ass)*perc))

train <- ass[indici, ]
test <- ass[-indici, ]

X.train <- train[, -c(6:7)]
X.test <- test[, -c(6:7)]

mat.train <- model.matrix(~. - 1, data = X.train)
mat.test <- model.matrix(~. - 1, data = X.test)

y.train <- train$costo.medio
y.test <- test$costo.medio

y.train.log <- train$costo.medio.log
y.test.log <- test$costo.medio.log

confronto <- matrix(NA, nrow = 1 , ncol =4, dimnames = list(c(), c("model", "est. scale", "err.original", "err.log")))
err <- function(prediction) errori(prediction, actual= test$costo.medio, log = T, actual.log= test$costo.medio.log)

Modelli lineari

# Regressione lineare
fit.lm <- lm(y.train ~.,data = X.train )
# summary(fit.lm) #bel
pred <- predict(fit.lm, newdata= X.test)
confronto[1, ] <- cbind("linear", "original",  err(pred))

# Posso togliere Ntot, ma non è che tanto migliorerebbe molto 

# un po' meglio con la trasformata
fit.lm <- lm(y.train.log ~.,data = X.train )
summary(fit.lm)

## 
## Call:
## lm(formula = y.train.log ~ ., data = X.train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.019 -0.821  0.530  1.817  7.225 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.13e+00   3.18e-01   28.74  < 2e-16 ***
## Km.anno     -4.01e-01   5.14e-02   -7.80  1.1e-14 ***
## Zonacittà    2.36e-01   1.91e-01    1.24   0.2164    
## Zonarurale  -2.45e+00   1.74e-01  -14.09  < 2e-16 ***
## Categoria    2.01e-01   3.62e-02    5.56  3.1e-08 ***
## Auto.tipo2  -9.53e-01   3.11e-01   -3.06   0.0022 ** 
## Auto.tipo3  -1.75e+00   3.05e-01   -5.75  1.1e-08 ***
## Auto.tipo4  -2.31e+00   3.11e-01   -7.45  1.6e-13 ***
## Auto.tipo5  -8.29e-01   2.99e-01   -2.78   0.0055 ** 
## Auto.tipo6  -4.99e-01   3.01e-01   -1.65   0.0982 .  
## Auto.tipo7  -1.15e+00   3.10e-01   -3.73   0.0002 ***
## Auto.tipo8  -1.71e+00   3.06e-01   -5.59  2.7e-08 ***
## Auto.tipo9   9.73e-01   3.17e-01    3.07   0.0022 ** 
## Ntot        -3.52e-05   1.29e-05   -2.73   0.0064 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.81 on 1513 degrees of freedom
## Multiple R-squared:  0.256,  Adjusted R-squared:  0.25 
## F-statistic: 40.1 on 13 and 1513 DF,  p-value: <2e-16

pred <- pmax(predict(fit.lm, newdata= X.test ), 0.5)
confronto <- rbind(confronto, cbind("linear", "log",  err(exp(pred)) ))

# Lasso
library(lars)
fit.lasso <- lars(mat.train, y.train, type = "lasso") 

p.lasso <- predict.lars(fit.lasso, newx = mat.test )
matrice.err<-tasso.lars(p.lasso$fit, y.test)
J <- matrice.err[matrice.err[,2]==min(matrice.err[,2], na.rm=T),1]
# Coefficienti
beta.lasso <- coef(fit.lasso)[J,] 
# Possiamo vedere quali coefficienti sono finiti a zero
varname<-attr(mat.train, "dimnames")[[2]] 
varname[beta.lasso==0]

## [1] "Zonametro"  "Auto.tipo5" "Auto.tipo6" "Ntot"

# e quali no
varname[beta.lasso!=0]

##  [1] "Km.anno"    "Zonacittà"  "Zonarurale" "Categoria"  "Auto.tipo2"
##  [6] "Auto.tipo3" "Auto.tipo4" "Auto.tipo7" "Auto.tipo8" "Auto.tipo9"

pred.lasso1<-  pmax(p.lasso$fit[,J],0.5)

confronto <- rbind(confronto, c("lasso","original" , err(pred.lasso1)))

Modello additivo

library(gam)

fit.gam <- step.gam(gam(y.train~1, data=X.train), scope=gam.scope(X.train))

## Start:  y.train ~ 1; AIC= 30004 
## Step:1 y.train ~ Zona ; AIC= 29986 
## Step:2 y.train ~ Zona + Auto.tipo ; AIC= 29981 
## Step:3 y.train ~ Zona + Categoria + Auto.tipo ; AIC= 29978 
## Step:4 y.train ~ Zona + s(Categoria) + Auto.tipo ; AIC= 29974

summary(fit.gam)

## 
## Call: gam(formula = y.train ~ Zona + s(Categoria) + Auto.tipo, data = X.train, 
##     trace = FALSE)
## Deviance Residuals:
##    Min     1Q Median     3Q    Max 
##  -5243  -2406   -847    931  29181 
## 
## (Dispersion Parameter for gaussian family taken to be 19392073)
## 
##     Null Deviance: 3.046e+10 on 1526 degrees of freedom
## Residual Deviance: 2.932e+10 on 1512 degrees of freedom
## AIC: 29974 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##                Df   Sum Sq  Mean Sq F value  Pr(>F)    
## Zona            2 4.28e+08 2.14e+08   11.04 1.7e-05 ***
## s(Categoria)    1 9.78e+07 9.78e+07    5.04  0.0249 *  
## Auto.tipo       8 4.21e+08 5.26e+07    2.71  0.0057 ** 
## Residuals    1512 2.93e+10 1.94e+07                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##              Npar Df Npar F Pr(F)  
## (Intercept)                        
## Zona                               
## s(Categoria)       3   3.21 0.022 *
## Auto.tipo                          
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

plot(fit.gam)

plot of chunk unnamed-chunk-4

pred.gam<-pmax(predict(fit.gam, X.test),0.5)

confronto <- rbind(confronto, c("gam","original" , err(pred.gam)))

Mars

library(polspline)

fit.mars <- polymars(y.train, X.train, factors = c(2, 4)) #bel
# summary(fit.mars)

# Con trasformata logaritmica
fit.mars.log <- polymars(y.train.log, X.train, factors = c(2, 4))
# summary(fit.mars.log)

tab <- fit.mars.log$model
tab$pred1 <- c("costante", names(X.train)[2], names(X.train)[1], names(X.train)[3], names(X.train)[5] ,
               names(X.train)[5], names(X.train)[1], names(X.train)[4])
tab

##       pred1 knot1 level1 pred2 knot2    coefs       SE
## 1  costante    NA     NA     0    NA  1.09138 0.370211
## 2      Zona    NA      3     0    NA -0.37394 0.129436
## 3   Km.anno    NA     NA     0    NA  0.51788 0.094051
## 4 Categoria    NA     NA     0    NA  0.04541 0.065066
## 5      Ntot    NA     NA     0    NA  0.13117 0.003842
## 6      Ntot 52.25     NA     0    NA -0.13116 0.003842
## 7   Km.anno    NA     NA     3    NA -0.08048 0.020082
## 8 Auto.tipo    NA      8     0    NA  0.65313 0.184836

pred.mars <- predict(fit.mars.log, X.test)

confronto <- rbind(confronto, c("mars","log" , err(exp(pred.mars))))

Albero di regressione

#-----------------------------------------------------------------------#
# Alberi (rpart con cross.validation..)
library(rpart)

fit.tree <- rpart(y.train ~., data = X.train, control = rpart.control(xval = 100))
#plot(fit.tree)
#text(fit.tree, cex = 0.7)

pred.tree <- predict(fit.tree, X.test)
confronto <- rbind(confronto, c("tree","original" , err(pred.tree)))

fit.tree.log <- rpart(y.train.log ~., data = X.train, control = rpart.control(xval = 100))
#plot(fit.tree.log)
#text(fit.tree.log, cex = 0.7)

pred.tree.log <- predict(fit.tree.log, X.test)
confronto <- rbind(confronto, c("tree","log" , err(exp(pred.tree.log))))

SVM

# SVM

library(e1071)

# Inutilità (cv per scelta del kernel)
# svm.radial <- svm(y.train ~., data = X.train, cross = 10, seed = 42)
# svm.linear <- svm(y.train ~., data = X.train, kernel = "linear", cross = 10, seed = 42)
svm.polynomial <- svm(y.train ~., data = X.train, kernel = "polynomial", cross = 10, seed = 42)
#svm.sigmoid <- svm(y.train ~., data = X.train, kernel = "sigmoid", cross = 10, seed = 42)

#c(svm.radial$tot.MSE ,svm.linear$tot.MSE, svm.polynomial$tot.MSE, svm.sigmoid$tot.MSE)

pred.svm <- pmax(predict(svm.polynomial, newdata = X.test, decision.values = TRUE), 0.5)

confronto <- rbind(confronto, c("svm","original" , err(pred.svm)))

# tanto vale farlo in scala log, l'interpretabilità è lasciata a se stessa
svm.radial <- svm(y.train.log ~., data = X.train, cross = 10, seed = 42)
# svm.linear <- svm(y.train.log ~., data = X.train, kernel = "linear", cross = 10, seed = 42)
# svm.polynomial <- svm(y.train.log ~., data = X.train, kernel = "polynomial", cross = 10, seed = 42)
# svm.sigmoid <- svm(y.train.log ~., data = X.train, kernel = "sigmoid", cross = 10, seed = 42)
# 
# c(svm.radial$tot.MSE ,svm.linear$tot.MSE, svm.polynomial$tot.MSE, svm.sigmoid$tot.MSE)

pred.svm.log <- pmax(predict(svm.radial, newdata = X.test, decision.values = TRUE), 0.5)
confronto <- rbind(confronto, c("svm","log" , err(exp(pred.svm))))

Random Forest

library (randomForest)

set.seed(42)

fit.rf2 <- randomForest(y.train ~. , data =X.train, nodesize=1, ntree = 1000, mtry=2)
plot(fit.rf2)

plot of chunk unnamed-chunk-8

pred.rf2 <-predict(fit.rf2, X.test)
confronto <- rbind(confronto, c("rf", "original",  err(pred.rf2)))

fit.rf.log <- randomForest(y.train.log ~. , data =X.train, nodesize=1, ntree = 1000, mtry=2)
plot(fit.rf.log)

plot of chunk unnamed-chunk-8

pred.rf.log <-predict(fit.rf.log, X.test)

confronto <- rbind(confronto, c("rf", "log",  err(exp(pred.rf.log))))

Bagging

library(ipred)

fit.bag<-bagging(y.train~., data= X.train ,nbagg=1000,coob=T) 

pred.bag<-pmax(predict(fit.bag, X.test),0.5)
err.bag <- err(pred.bag)
confronto <- rbind(confronto , c("bagging","original",  err.bag))

Confronto finale

model est.	scale	err.original	err.log
linear	log	32506279866	5056
mars	log	19646000857	3314
tree	log	16289580975	3383
svm	log	16515165777	5069
rf	log	16411400209	3139
nnet2	log	14067478476	7115
tree	orginal	14013307534	6515
linear	original	14511112998	7698
gam	original	14549665151	7738
svm	original	15304393684	6939
rf	original	14117301154	6259
bagging	original	13642456678	6607
nnet1	original	14913137077	8166

Scelta del modello? dipende se mi interessa interpretarlo.. cioè se l'obiettivo è solo predizione prendo il migliore (foreste casuali/bagging), anche perchè tanto le variabili sono anche poche. E a questo punto mi richiedo se potevo usare anche V8 e V9 che ho buttato, ma a pelle direi che è meglio di no, potrebbero essere qualsiasi cosa.