Homework DataMining

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(nasa_nb)

##  Neo.Reference.ID       Name         Absolute.Magnitude Est.Dia.in.KM.min. 
##  Min.   :2000433   Min.   :2000433   Min.   :11.16      Min.   : 0.001011  
##  1st Qu.:3097594   1st Qu.:3097594   1st Qu.:20.10      1st Qu.: 0.033462  
##  Median :3514799   Median :3514799   Median :21.90      Median : 0.110804  
##  Mean   :3272298   Mean   :3272298   Mean   :22.27      Mean   : 0.204604  
##  3rd Qu.:3690060   3rd Qu.:3690060   3rd Qu.:24.50      3rd Qu.: 0.253837  
##  Max.   :3781897   Max.   :3781897   Max.   :32.10      Max.   :15.579552  
##  Est.Dia.in.KM.max. Est.Dia.in.M.min.   Est.Dia.in.M.max. 
##  Min.   : 0.00226   Min.   :    1.011   Min.   :    2.26  
##  1st Qu.: 0.07482   1st Qu.:   33.462   1st Qu.:   74.82  
##  Median : 0.24777   Median :  110.804   Median :  247.77  
##  Mean   : 0.45751   Mean   :  204.604   Mean   :  457.51  
##  3rd Qu.: 0.56760   3rd Qu.:  253.837   3rd Qu.:  567.60  
##  Max.   :34.83694   Max.   :15579.552   Max.   :34836.94  
##  Est.Dia.in.Miles.min. Est.Dia.in.Miles.max. Est.Dia.in.Feet.min.
##  Min.   :0.000628      Min.   : 0.001404     Min.   :    3.32    
##  1st Qu.:0.020792      1st Qu.: 0.046493     1st Qu.:  109.78    
##  Median :0.068850      Median : 0.153954     Median :  363.53    
##  Mean   :0.127135      Mean   : 0.284283     Mean   :  671.27    
##  3rd Qu.:0.157727      3rd Qu.: 0.352688     3rd Qu.:  832.80    
##  Max.   :9.680682      Max.   :21.646663     Max.   :51114.02    
##  Est.Dia.in.Feet.max. Close.Approach.Date Epoch.Date.Close.Approach
##  Min.   :     7.41    Length:4687         Min.   :7.889e+11        
##  1st Qu.:   245.49    Class :character    1st Qu.:1.016e+12        
##  Median :   812.88    Mode  :character    Median :1.203e+12        
##  Mean   :  1501.01                        Mean   :1.180e+12        
##  3rd Qu.:  1862.19                        3rd Qu.:1.356e+12        
##  Max.   :114294.42                        Max.   :1.473e+12        
##  Relative.Velocity.km.per.sec Relative.Velocity.km.per.hr Miles.per.hour   
##  Min.   : 0.3355              Min.   :  1208              Min.   :  750.5  
##  1st Qu.: 8.4329              1st Qu.: 30358              1st Qu.:18863.5  
##  Median :12.9179              Median : 46504              Median :28896.0  
##  Mean   :13.9708              Mean   : 50295              Mean   :31251.3  
##  3rd Qu.:18.0776              3rd Qu.: 65080              3rd Qu.:40437.9  
##  Max.   :44.6337              Max.   :160681              Max.   :99841.2  
##  Miss.Dist..Astronomical. Miss.Dist..lunar.   Miss.Dist..kilometers.
##  Min.   :0.0001779        Min.   :  0.06919   Min.   :   26610      
##  1st Qu.:0.1334196        1st Qu.: 51.90021   1st Qu.:19959283      
##  Median :0.2650286        Median :103.09612   Median :39647712      
##  Mean   :0.2567782        Mean   : 99.88671   Mean   :38413467      
##  3rd Qu.:0.3841541        3rd Qu.:149.43592   3rd Qu.:57468628      
##  Max.   :0.4998841        Max.   :194.45491   Max.   :74781600      
##  Miss.Dist..miles.  Orbiting.Body         Orbit.ID     Orbit.Determination.Date
##  Min.   :   16535   Length:4687        Min.   :  1.0   Length:4687             
##  1st Qu.:12402124   Class :character   1st Qu.:  9.0   Class :character        
##  Median :24635948   Mode  :character   Median : 16.0   Mode  :character        
##  Mean   :23869022                      Mean   : 28.3                           
##  3rd Qu.:35709350                      3rd Qu.: 31.0                           
##  Max.   :46467132                      Max.   :611.0                           
##  Orbit.Uncertainity Minimum.Orbit.Intersection Jupiter.Tisserand.Invariant
##  Min.   :0.000      Min.   :0.0000021          Min.   :2.196              
##  1st Qu.:0.000      1st Qu.:0.0145851          1st Qu.:4.050              
##  Median :3.000      Median :0.0473655          Median :5.071              
##  Mean   :3.517      Mean   :0.0823201          Mean   :5.056              
##  3rd Qu.:6.000      3rd Qu.:0.1235935          3rd Qu.:6.019              
##  Max.   :9.000      Max.   :0.4778910          Max.   :9.025              
##  Epoch.Osculation   Eccentricity      Semi.Major.Axis   Inclination      
##  Min.   :2450165   Min.   :0.007522   Min.   :0.6159   Min.   : 0.01451  
##  1st Qu.:2458001   1st Qu.:0.240858   1st Qu.:1.0006   1st Qu.: 4.96234  
##  Median :2458001   Median :0.372450   Median :1.2410   Median :10.31184  
##  Mean   :2457724   Mean   :0.382569   Mean   :1.4003   Mean   :13.37384  
##  3rd Qu.:2458001   3rd Qu.:0.512411   3rd Qu.:1.6784   3rd Qu.:19.51168  
##  Max.   :2458020   Max.   :0.960261   Max.   :5.0720   Max.   :75.40667  
##  Asc.Node.Longitude Orbital.Period   Perihelion.Distance Perihelion.Arg    
##  Min.   :  0.0019   Min.   : 176.6   Min.   :0.08074     Min.   :  0.0069  
##  1st Qu.: 83.0812   1st Qu.: 365.6   1st Qu.:0.63083     1st Qu.: 95.6259  
##  Median :172.6254   Median : 504.9   Median :0.83315     Median :189.7616  
##  Mean   :172.1573   Mean   : 635.6   Mean   :0.81338     Mean   :183.9322  
##  3rd Qu.:255.0269   3rd Qu.: 794.2   3rd Qu.:0.99723     3rd Qu.:271.7776  
##  Max.   :359.9059   Max.   :4172.2   Max.   :1.29983     Max.   :359.9931  
##  Aphelion.Dist    Perihelion.Time    Mean.Anomaly       Mean.Motion     
##  Min.   :0.8038   Min.   :2450100   Min.   :  0.0032   Min.   :0.08628  
##  1st Qu.:1.2661   1st Qu.:2457815   1st Qu.: 87.0069   1st Qu.:0.45329  
##  Median :1.6182   Median :2457973   Median :185.7189   Median :0.71295  
##  Mean   :1.9871   Mean   :2457728   Mean   :181.1679   Mean   :0.73824  
##  3rd Qu.:2.4512   3rd Qu.:2458108   3rd Qu.:276.5319   3rd Qu.:0.98467  
##  Max.   :8.9839   Max.   :2458839   Max.   :359.9180   Max.   :2.03900  
##    Equinox           Hazardous        
##  Length:4687        Length:4687       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##

sum(is.na(nasa_nb))

## [1] 0

nasa_clean <- nasa_nb %>% select(-Neo.Reference.ID, -Name , -Close.Approach.Date, -Orbiting.Body,
                              -Orbit.ID, -Orbit.Determination.Date, -Equinox, 
                              -Asc.Node.Longitude, -Est.Dia.in.Feet.max., -Est.Dia.in.Feet.min.,
                              -Est.Dia.in.M.max., -Est.Dia.in.M.min., -Est.Dia.in.Miles.max.,
                              -Est.Dia.in.Miles.min., -Epoch.Date.Close.Approach, -Relative.Velocity.km.per.sec,
                              -Miss.Dist..Astronomical., -Miss.Dist..lunar., -Miss.Dist..miles.,
                              -Miles.per.hour, -Epoch.Osculation, -Perihelion.Time, -Orbital.Period) %>% 
                       mutate(Hazardous=ifelse(Hazardous=="True", 1,0)) %>% mutate(Hazardous=as.factor(Hazardous))

set.seed(123)
nasa_idx = sample(nrow(nasa_clean), nrow(nasa_clean)*0.8) 
# Training Set
nasa_trn = nasa_clean[nasa_idx, ]
# Test Set
nasa_tst = nasa_clean[-nasa_idx,] 

subtrain_index <- sample(nrow(nasa_trn), nrow(nasa_trn)*0.65)
# Subtraining
sub_training <- nasa_trn[subtrain_index, ]
# Validation
validation <- nasa_trn[-subtrain_index, ]

summary(sub_training)

##  Absolute.Magnitude Est.Dia.in.KM.min. Est.Dia.in.KM.max.
##  Min.   :13.92      Min.   :0.001011   Min.   :0.00226   
##  1st Qu.:20.10      1st Qu.:0.035039   1st Qu.:0.07835   
##  Median :21.90      Median :0.110804   Median :0.24777   
##  Mean   :22.28      Mean   :0.195312   Mean   :0.43673   
##  3rd Qu.:24.40      3rd Qu.:0.253837   3rd Qu.:0.56760   
##  Max.   :32.10      Max.   :4.370740   Max.   :9.77327   
##  Relative.Velocity.km.per.hr Miss.Dist..kilometers. Orbit.Uncertainity
##  Min.   :  1208              Min.   :   72172       Min.   :0.000     
##  1st Qu.: 30419              1st Qu.:20793231       1st Qu.:0.000     
##  Median : 45118              Median :39742096       Median :3.000     
##  Mean   : 49690              Mean   :38709324       Mean   :3.526     
##  3rd Qu.: 63670              3rd Qu.:57311427       3rd Qu.:6.000     
##  Max.   :153789              Max.   :74781600       Max.   :9.000     
##  Minimum.Orbit.Intersection Jupiter.Tisserand.Invariant  Eccentricity     
##  Min.   :0.0000021          Min.   :2.240               Min.   :0.007522  
##  1st Qu.:0.0147658          1st Qu.:4.013               1st Qu.:0.244002  
##  Median :0.0485300          Median :5.063               Median :0.376130  
##  Mean   :0.0831235          Mean   :5.052               Mean   :0.381790  
##  3rd Qu.:0.1240818          3rd Qu.:6.030               3rd Qu.:0.506546  
##  Max.   :0.4778910          Max.   :9.025               Max.   :0.960261  
##  Semi.Major.Axis   Inclination       Perihelion.Distance Perihelion.Arg    
##  Min.   :0.6159   Min.   : 0.02099   Min.   :0.08074     Min.   :  0.4588  
##  1st Qu.:0.9997   1st Qu.: 4.91180   1st Qu.:0.62882     1st Qu.: 97.7800  
##  Median :1.2436   Median :10.24206   Median :0.83738     Median :193.2657  
##  Mean   :1.4049   Mean   :13.29716   Mean   :0.81655     Mean   :186.8172  
##  3rd Qu.:1.7033   3rd Qu.:19.23963   3rd Qu.:1.00094     3rd Qu.:276.5412  
##  Max.   :5.0720   Max.   :75.40667   Max.   :1.29983     Max.   :359.9298  
##  Aphelion.Dist     Mean.Anomaly       Mean.Motion      Hazardous
##  Min.   :0.8038   Min.   :  0.0032   Min.   :0.08628   0:2046   
##  1st Qu.:1.2667   1st Qu.: 86.8170   1st Qu.:0.44336   1: 390   
##  Median :1.6127   Median :184.0743   Median :0.71068            
##  Mean   :1.9932   Mean   :180.7745   Mean   :0.73691            
##  3rd Qu.:2.4738   3rd Qu.:276.6560   3rd Qu.:0.98603            
##  Max.   :8.9839   Max.   :359.9180   Max.   :2.03900

a <- round(prop.table(table(sub_training$Hazardous)),4)
b <- round(prop.table(table(validation$Hazardous)),4)
c <- round(prop.table(table(nasa_tst$Hazardous)),4)

dataset <- c(rep("Subtraining Set" , 2) , rep("Validation Set" , 2) , rep("Test Set" , 2))
esito <- rep(c("Non-Hazardous" , "Hazardous") , 3)
proporzione <- c(a,b,c)
data <- data.frame(dataset,esito,proporzione)

bar_plot <- ggplot(data, aes(fill=esito, y=proporzione, x=dataset)) + 
  geom_bar(position="fill", stat="identity") +
  scale_fill_manual(values = c("purple","green2")) +
  ggtitle("Proporzioni degli esiti")
ggplotly(bar_plot)

set.seed(123)
sub_training <- upSample(sub_training[,-17], sub_training[,17], yname = "Hazardous")
table(sub_training$Hazardous)

## 
##    0    1 
## 2046 2046

melted_data <- reshape2::melt(sub_training, id.vars = "Hazardous") %>% rename(Variabile=variable) 
# summary(melted_data)
ggplot(melted_data, aes( x="", y = value, fill = Variabile)) +
  geom_boxplot( outlier.shape="x", outlier.size=3, show.legend = F) +
  facet_wrap(~ Variabile, nrow = 5, scales = "free") +
  labs(title = "Boxplots per 12 Variabili", x = "") +
  theme(axis.title.y=element_blank(),
        axis.title.x=element_blank())

apply(sub_training[,-17], 2, skewness)

##          Absolute.Magnitude          Est.Dia.in.KM.min. 
##                  0.67329856                  4.84574954 
##          Est.Dia.in.KM.max. Relative.Velocity.km.per.hr 
##                  4.84574954                  0.90667778 
##      Miss.Dist..kilometers.          Orbit.Uncertainity 
##                 -0.12678447                  0.68956984 
##  Minimum.Orbit.Intersection Jupiter.Tisserand.Invariant 
##                  2.23892229                  0.11079086 
##                Eccentricity             Semi.Major.Axis 
##                  0.22659508                  1.02578150 
##                 Inclination         Perihelion.Distance 
##                  1.45343450                 -0.10901073 
##              Perihelion.Arg               Aphelion.Dist 
##                 -0.11126068                  1.20952428 
##                Mean.Anomaly                 Mean.Motion 
##                 -0.07464279                  0.43142266

sub_trn_log <- sub_training %>% mutate(Est.Dia.in.KM.min.=log(Est.Dia.in.KM.min.),
                                       Est.Dia.in.KM.max.=log(Est.Dia.in.KM.max.),
                                       Minimum.Orbit.Intersection=log(Minimum.Orbit.Intersection),
                                       Inclination=log(Inclination))

#GRAFICO BOXPLOT POST TRASFORMAZIONI LOG
melted_data <- reshape2::melt(sub_trn_log, id.vars = "Hazardous") %>% rename(Variabile=variable) 
# summary(melted_data)
ggplot(melted_data, aes( x="", y = value, fill = Variabile)) +
  geom_boxplot( outlier.shape="x", outlier.size=3, show.legend = F) +
  facet_wrap(~ Variabile, nrow = 5, scales = "free") +
  labs(title = "Boxplots per 12 Variabili", x = "") +
  theme(axis.title.y=element_blank(),
        axis.title.x=element_blank())

apply(sub_trn_log[,-17], 2, skewness)

##          Absolute.Magnitude          Est.Dia.in.KM.min. 
##                  0.67329856                 -0.67329856 
##          Est.Dia.in.KM.max. Relative.Velocity.km.per.hr 
##                 -0.67329856                  0.90667778 
##      Miss.Dist..kilometers.          Orbit.Uncertainity 
##                 -0.12678447                  0.68956984 
##  Minimum.Orbit.Intersection Jupiter.Tisserand.Invariant 
##                 -0.88291601                  0.11079086 
##                Eccentricity             Semi.Major.Axis 
##                  0.22659508                  1.02578150 
##                 Inclination         Perihelion.Distance 
##                 -0.73044257                 -0.10901073 
##              Perihelion.Arg               Aphelion.Dist 
##                 -0.11126068                  1.20952428 
##                Mean.Anomaly                 Mean.Motion 
##                 -0.07464279                  0.43142266

corr1 <- round(cor(sub_trn_log[,-c(17)]), 3)

corr_plot <- ggcorrplot(corr1, hc.order = TRUE, lab=T, lab_size = 4, type="lower", show.diag = T)
corr_plot

sub_trn_clean <- sub_trn_log %>% select(-Mean.Motion,-Semi.Major.Axis,-Aphelion.Dist,
                                         -Est.Dia.in.KM.min., -Est.Dia.in.KM.max.)

melted_data2 <- reshape2::melt(sub_trn_clean, id.vars = "Hazardous")

ggplot(melted_data2, aes(x = value)) +
  geom_density(aes(fill=factor(Hazardous)), alpha=0.5) +
  facet_wrap(~ variable, nrow = 4, scales = "free") +
  labs(title = "Distribuzioni di densità condizionatamente a Hazardous e Non-Hazardous", x = "Valore", y = "Densità") +
  scale_fill_manual(values = c("green2","purple"), name = "Esito", labels = c("Non-Hazardous",  "Hazardous")) +
  theme_minimal()

final_sub_trn <- sub_trn_clean %>% select(Absolute.Magnitude, Orbit.Uncertainity,
                                          Minimum.Orbit.Intersection,Hazardous)

#### Normalizzazione Subtraining (con trasformazioni effettuate) ####

matrix_indicators <- matrix(0, nrow=(dim(final_sub_trn)[2]-1), ncol=2) 
colnames(matrix_indicators) <- c("min", "max")

# calcolo min e max.
for (i in 1:(dim(final_sub_trn)[2]-1)){
  matrix_indicators[i, "min"] <- min(final_sub_trn[,i])
  matrix_indicators[i, "max"] <- max(final_sub_trn[,i])
}

# normalizzazione
for (i in 1:(dim(final_sub_trn)[2]-1)){
  final_sub_trn[, i] <- (final_sub_trn[, i] - matrix_indicators[i, "min"])/(matrix_indicators[i, "max"]- matrix_indicators[i, "min"])
}

summary(final_sub_trn)

##  Absolute.Magnitude Orbit.Uncertainity Minimum.Orbit.Intersection Hazardous
##  Min.   :0.0000     Min.   :0.0000     Min.   :0.0000             0:2046   
##  1st Qu.:0.3179     1st Qu.:0.0000     1st Qu.:0.7080             1:2046   
##  Median :0.3949     Median :0.1111     Median :0.7778                      
##  Mean   :0.4134     Mean   :0.2878     Mean   :0.7638                      
##  3rd Qu.:0.4829     3rd Qu.:0.5556     3rd Qu.:0.8389                      
##  Max.   :1.0000     Max.   :1.0000     Max.   :1.0000

#### Preparazione (norm e log) Validation ####

validation_log <- validation %>% select(Absolute.Magnitude, Orbit.Uncertainity,
                                        Minimum.Orbit.Intersection,Hazardous) %>%
  mutate(Minimum.Orbit.Intersection=log(Minimum.Orbit.Intersection))
table(validation_log$Hazardous)

## 
##    0    1 
## 1107  206

summary(validation_log)

##  Absolute.Magnitude Orbit.Uncertainity Minimum.Orbit.Intersection Hazardous
##  Min.   :11.16      Min.   :0.000      Min.   :-11.1140           0:1107   
##  1st Qu.:20.00      1st Qu.:0.000      1st Qu.: -4.2013           1: 206   
##  Median :22.00      Median :4.000      Median : -3.1249                    
##  Mean   :22.30      Mean   :3.599      Mean   : -3.3772                    
##  3rd Qu.:24.70      3rd Qu.:7.000      3rd Qu.: -2.1242                    
##  Max.   :30.80      Max.   :9.000      Max.   : -0.7864

# normalizzazione
for (i in 1:(dim(validation_log)[2]-1)){
  validation_log[, i] <- (validation_log[, i] - matrix_indicators[i, "min"])/(matrix_indicators[i, "max"]- matrix_indicators[i, "min"])
}

summary(validation_log)

##  Absolute.Magnitude Orbit.Uncertainity Minimum.Orbit.Intersection Hazardous
##  Min.   :-0.1518    Min.   :0.0000     Min.   :0.1601             0:1107   
##  1st Qu.: 0.3344    1st Qu.:0.0000     1st Qu.:0.7197             1: 206   
##  Median : 0.4444    Median :0.4444     Median :0.8068                      
##  Mean   : 0.4607    Mean   :0.3999     Mean   :0.7864                      
##  3rd Qu.: 0.5930    3rd Qu.:0.7778     3rd Qu.:0.8878                      
##  Max.   : 0.9285    Max.   :1.0000     Max.   :0.9961

LDA QDA

# Funzione per Q-Q plot per singola variabile
qq_plot <- function(data, var) {
  ggplot(data, aes(sample = !!sym(var), color = Hazardous)) + 
    geom_qq_line(color="black", linewidth=0.8) +
    geom_qq() + 
    facet_wrap(~Hazardous) +
    labs(title = paste("Q-Q Plot di", var), size = 2) +
    labs(x = "", y = "") +
    scale_color_manual(values = c("green2","purple"), name = "Esito", labels = c("Non-Hazardous",  "Hazardous"))
}

# Q-Q plot per le diverse variabili
plots_list <- lapply(names(final_sub_trn)[1:ncol(final_sub_trn)-1], function(var) qq_plot(final_sub_trn, var))

# Unione dei Q-Q plot
grid.arrange(grobs = plots_list, ncol = 2)

# Variabili di supporto
pvalue_shapiro <- matrix(0, nrow = (dim(final_sub_trn)[2]-1), ncol = 2)
rownames(pvalue_shapiro) = colnames(final_sub_trn)[-4]
colnames(pvalue_shapiro) = c("1", "0")

# Test Shapiro e costruzione di una matrice riassuntiva con i p-value condizionati alla classe
for (i in colnames(final_sub_trn)[-4]){
  pvalue_shapiro[i, "1"] <- shapiro.test(final_sub_trn[final_sub_trn$Hazardous == "1", i])$p.value
  pvalue_shapiro[i, "0"] <- shapiro.test(final_sub_trn[final_sub_trn$Hazardous == "0", i])$p.value
}
round(pvalue_shapiro, 10)

##                            1        0
## Absolute.Magnitude         0 3.78e-08
## Orbit.Uncertainity         0 0.00e+00
## Minimum.Orbit.Intersection 0 0.00e+00

REGRESSIONE LOGISTICA

model_logit<-glm(Hazardous~., data=final_sub_trn, family=binomial)
summary(model_logit)#tutte significative tranne relative velocity per km e eccentricity

## 
## Call:
## glm(formula = Hazardous ~ ., family = binomial, data = final_sub_trn)
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 21.3387     0.7245  29.454  < 2e-16 ***
## Absolute.Magnitude         -16.9947     0.6719 -25.295  < 2e-16 ***
## Orbit.Uncertainity          -1.3040     0.1906  -6.842 7.81e-12 ***
## Minimum.Orbit.Intersection -18.1837     0.6601 -27.546  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5672.7  on 4091  degrees of freedom
## Residual deviance: 2958.0  on 4088  degrees of freedom
## AIC: 2966
## 
## Number of Fisher Scoring iterations: 6

influencePlot(model_logit)

##        StudRes          Hat       CookD
## 315  -1.137165 1.002951e-02 0.002303363
## 746  -1.406131 1.998944e-02 0.008548113
## 1586 -4.401998 1.169127e-05 0.043251744
## 1928 -3.954934 4.575701e-05 0.026996968

final_sub_trn2<-final_sub_trn[-c(746,315,1586,1928), ]

model_logit2<-glm(Hazardous~., data=final_sub_trn2, family='binomial')
summary(model_logit2) #AIC MIGLIORATO

## 
## Call:
## glm(formula = Hazardous ~ ., family = "binomial", data = final_sub_trn2)
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 21.8868     0.7423  29.486  < 2e-16 ***
## Absolute.Magnitude         -17.2263     0.6812 -25.289  < 2e-16 ***
## Orbit.Uncertainity          -1.3087     0.1921  -6.811 9.68e-12 ***
## Minimum.Orbit.Intersection -18.7563     0.6786 -27.639  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5667.2  on 4087  degrees of freedom
## Residual deviance: 2919.1  on 4084  degrees of freedom
## AIC: 2927.1
## 
## Number of Fisher Scoring iterations: 6

probabilities <- predict(model_logit2, type = "response")
predictors <- c("Absolute.Magnitude", "Orbit.Uncertainity", "Minimum.Orbit.Intersection")

# Costruzione delle log(p1/(1-p1))
supp <- final_sub_trn2[, -c(4)]
supp <- supp %>%
  mutate(logit = log(probabilities/(1-probabilities))) %>%
  gather(key = "predictors", value = "predictor.value", -logit)

# Costruzione dei grafici
ggplot(supp, aes(logit, predictor.value))+
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") + 
  theme_bw() + 
  facet_wrap(~predictors, scales = "free_y")

## `geom_smooth()` using formula = 'y ~ x'

predAIC<-ifelse(predict(model_logit2, validation_log, type='response')>0.5, 1, 0)
confusionMatrix(as.factor(predAIC), validation_log[, 4], positive='1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 890  25
##          1 217 181
##                                           
##                Accuracy : 0.8157          
##                  95% CI : (0.7936, 0.8363)
##     No Information Rate : 0.8431          
##     P-Value [Acc > NIR] : 0.9967          
##                                           
##                   Kappa : 0.4949          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8786          
##             Specificity : 0.8040          
##          Pos Pred Value : 0.4548          
##          Neg Pred Value : 0.9727          
##              Prevalence : 0.1569          
##          Detection Rate : 0.1379          
##    Detection Prevalence : 0.3031          
##       Balanced Accuracy : 0.8413          
##                                           
##        'Positive' Class : 1               
##

#F1 score
confusionMatrix(as.factor(predAIC), validation_log[, 4], positive='1')$byClass[7]

##        F1 
## 0.5993377

pred_roclogit <- prediction(predAIC, validation$Hazardous)
perf_logit <- performance(pred_roclogit,"tpr","fpr")
auc_logit <- performance(pred_roclogit, measure = "auc")@y.values
auc_logit

## [[1]]
## [1] 0.8413077

KNN

final_sub_trn_KNN <- sub_training %>% select(Absolute.Magnitude, Orbit.Uncertainity,
                                             Minimum.Orbit.Intersection,Hazardous)

#### Normalizzazione Subtraining (per KNN senza trasformazioni effettuate) ####

matrix_indicators_KNN <- matrix(0, nrow=(dim(final_sub_trn_KNN)[2]-1), ncol=2) 
colnames(matrix_indicators_KNN) <- c("min", "max")

# calcolo min e max.
for (i in 1:(dim(final_sub_trn_KNN)[2]-1)){
  matrix_indicators_KNN[i, "min"] <- min(final_sub_trn_KNN[,i])
  matrix_indicators_KNN[i, "max"] <- max(final_sub_trn_KNN[,i])
}

# normalizzazione
for (i in 1:(dim(final_sub_trn_KNN)[2]-1)){
  final_sub_trn_KNN[, i] <- (final_sub_trn_KNN[, i] - matrix_indicators_KNN[i, "min"])/(matrix_indicators_KNN[i, "max"]- matrix_indicators_KNN[i, "min"])
}

summary(final_sub_trn_KNN)

##  Absolute.Magnitude Orbit.Uncertainity Minimum.Orbit.Intersection Hazardous
##  Min.   :0.0000     Min.   :0.0000     Min.   :0.00000            0:2046   
##  1st Qu.:0.3179     1st Qu.:0.0000     1st Qu.:0.02711            1:2046   
##  Median :0.3949     Median :0.1111     Median :0.06422                     
##  Mean   :0.4134     Mean   :0.2878     Mean   :0.12322                     
##  3rd Qu.:0.4829     3rd Qu.:0.5556     3rd Qu.:0.13670                     
##  Max.   :1.0000     Max.   :1.0000     Max.   :1.00000

#### Preparazione (e stand) Validation KNN ####

validation_KNN <- validation %>% select(Absolute.Magnitude, Orbit.Uncertainity,
                                        Minimum.Orbit.Intersection,Hazardous)
table(validation_KNN$Hazardous)

## 
##    0    1 
## 1107  206

summary(validation_KNN)

##  Absolute.Magnitude Orbit.Uncertainity Minimum.Orbit.Intersection Hazardous
##  Min.   :11.16      Min.   :0.000      Min.   :0.0000149          0:1107   
##  1st Qu.:20.00      1st Qu.:0.000      1st Qu.:0.0149764          1: 206   
##  Median :22.00      Median :4.000      Median :0.0439429                   
##  Mean   :22.30      Mean   :3.599      Mean   :0.0799857                   
##  3rd Qu.:24.70      3rd Qu.:7.000      3rd Qu.:0.1195290                   
##  Max.   :30.80      Max.   :9.000      Max.   :0.4554590

# normalizzazione
for (i in 1:(dim(validation_KNN)[2]-1)){
  validation_KNN[, i] <- (validation_KNN[, i] - matrix_indicators_KNN[i, "min"])/(matrix_indicators_KNN[i, "max"]- matrix_indicators_KNN[i, "min"])
}
summary(validation_KNN)

##  Absolute.Magnitude Orbit.Uncertainity Minimum.Orbit.Intersection Hazardous
##  Min.   :-0.1518    Min.   :0.0000     Min.   :0.0000269          0:1107   
##  1st Qu.: 0.3344    1st Qu.:0.0000     1st Qu.:0.0313343          1: 206   
##  Median : 0.4444    Median :0.4444     Median :0.0919478                   
##  Mean   : 0.4607    Mean   :0.3999     Mean   :0.1673688                   
##  3rd Qu.: 0.5930    3rd Qu.:0.7778     3rd Qu.:0.2501145                   
##  Max.   : 0.9285    Max.   :1.0000     Max.   :0.9530602

calc_class_err = function(actual, predicted) { mean(actual != predicted) }

## Valutazione del validation error al variare di k
## Con questo approccio posso definire anche il k ottimale da utilizzare poi nel test set
set.seed(123)
k_to_try = 1:100
err_k = rep(x = 0, times = length(k_to_try))
for (i in seq_along(k_to_try)) {
  pred = knn(train = final_sub_trn_KNN[,-4],
             test = validation_KNN[,-4], cl = final_sub_trn_KNN[,4],
             k = k_to_try[i])
  err_k[i] = calc_class_err(validation_KNN[,4], pred) }


## test error Vs k
plot(err_k, type = "b", col = "dodgerblue", cex = 1, pch = 20,
     xlab = "k, number of neighbors", ylab = "classification error", 
     main = "(Test) Error Rate vs Neighbors")

## Errore minimo
min(err_k)

## [1] 0.01827875

## Valori di k che danno l'errore minimo
which(err_k == min(err_k))

## [1] 1

err_filtr <- err_k[err_k > 0.02]
posizione <- which.min(err_filtr)
best_k <- which(err_k == err_filtr[posizione])
best_k

## [1] 3

err_k[best_k]

## [1] 0.02665651

PREVISIONE REGR LOG E KNN SU VALIDATION

predAIC<-ifelse(predict(model_logit2, validation_log, type='response')>0.5, 1, 0)
confusionMatrix(as.factor(predAIC), validation_log[, 4], positive='1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 890  25
##          1 217 181
##                                           
##                Accuracy : 0.8157          
##                  95% CI : (0.7936, 0.8363)
##     No Information Rate : 0.8431          
##     P-Value [Acc > NIR] : 0.9967          
##                                           
##                   Kappa : 0.4949          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8786          
##             Specificity : 0.8040          
##          Pos Pred Value : 0.4548          
##          Neg Pred Value : 0.9727          
##              Prevalence : 0.1569          
##          Detection Rate : 0.1379          
##    Detection Prevalence : 0.3031          
##       Balanced Accuracy : 0.8413          
##                                           
##        'Positive' Class : 1               
##

pred <-  knn(train = final_sub_trn_KNN[,-4], test = validation_KNN[,-4], cl = final_sub_trn_KNN[,4],k = 3, prob = TRUE)
confusionMatrix(as.factor(pred),as.factor(validation_KNN$Hazardous), positive="1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1074    2
##          1   33  204
##                                           
##                Accuracy : 0.9733          
##                  95% CI : (0.9631, 0.9814)
##     No Information Rate : 0.8431          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9051          
##                                           
##  Mcnemar's Test P-Value : 3.959e-07       
##                                           
##             Sensitivity : 0.9903          
##             Specificity : 0.9702          
##          Pos Pred Value : 0.8608          
##          Neg Pred Value : 0.9981          
##              Prevalence : 0.1569          
##          Detection Rate : 0.1554          
##    Detection Prevalence : 0.1805          
##       Balanced Accuracy : 0.9802          
##                                           
##        'Positive' Class : 1               
##

# F1 score regr logistica
F1_regr_log <-confusionMatrix(as.factor(predAIC), validation_log[, 4], positive='1')$byClass[7]
F1_regr_log

##        F1 
## 0.5993377

# F1 score KNN
F1_KNN <- confusionMatrix(as.factor(pred),as.factor(validation_KNN$Hazardous), positive="1")$byClass[7]
F1_KNN

##        F1 
## 0.9209932

pred_roclogit <- prediction(predAIC, validation$Hazardous)
perf_logit <- performance(pred_roclogit,"tpr","fpr")
auc_logit <- performance(pred_roclogit, measure = "auc")@y.values
auc_logit

## [[1]]
## [1] 0.8413077

prob_knn_val <- attr(pred, "prob")
prob_knn_val <- 2*ifelse(pred == "0", 1-prob_knn_val, prob_knn_val) -1
pred_rocknn_val <- prediction(prob_knn_val, validation_KNN[, 4])
pred_knn_val <- performance(pred_rocknn_val, "tpr", "fpr")
auc_knn_val <- performance(pred_rocknn_val, measure = "auc")@y.values
auc_knn_val

## [[1]]
## [1] 0.9891314

par(mfrow=c(1,2))
plot(perf_logit, colorize=T, lwd=3, main="Regressione Logistica su Validation", cex.main = 1) ; abline(a=0, b=1, lty=2, col="gray")
plot(pred_knn_val, colorize=T, lwd=3, main="KNN su Validation", cex.main = 1); abline(a=0, b=1, lty=2, col="gray")

PARTE TRAINING SU TEST

KNN

training_KNN <- nasa_trn %>% select(Absolute.Magnitude, Orbit.Uncertainity,
                                    Minimum.Orbit.Intersection,Hazardous)

# summary(training_KNN)

set.seed(123)
training_KNN <- upSample(training_KNN[,-4], training_KNN[,4], yname="Hazardous")
table(training_KNN$Hazardous)

## 
##    0    1 
## 3153 3153

# Normalizzazione 
matrix_indicators_TRN_KNN <- matrix(0, nrow=(dim(training_KNN)[2]-1), ncol=2) 
colnames(matrix_indicators_TRN_KNN) <- c("min", "max")

# calcolo min e max.
for (i in 1:(dim(training_KNN)[2]-1)){
  matrix_indicators_TRN_KNN[i, "min"] <- min(training_KNN[,i])
  matrix_indicators_TRN_KNN[i, "max"] <- max(training_KNN[,i])
}

# normalizzazione
for (i in 1:(dim(training_KNN)[2]-1)){
  training_KNN[, i] <- (training_KNN[, i] - matrix_indicators_TRN_KNN[i, "min"])/(matrix_indicators_TRN_KNN[i, "max"]- matrix_indicators_TRN_KNN[i, "min"])
}

summary(training_KNN)

##  Absolute.Magnitude Orbit.Uncertainity Minimum.Orbit.Intersection Hazardous
##  Min.   :0.0000     Min.   :0.0000     Min.   :0.00000            0:3153   
##  1st Qu.:0.4031     1st Qu.:0.0000     1st Qu.:0.02717            1:3153   
##  Median :0.4699     Median :0.1111     Median :0.06108                     
##  Mean   :0.4884     Mean   :0.2939     Mean   :0.12154                     
##  3rd Qu.:0.5511     3rd Qu.:0.6667     3rd Qu.:0.13355                     
##  Max.   :1.0000     Max.   :1.0000     Max.   :1.00000

test_KNN <- nasa_tst %>% select(Absolute.Magnitude, Orbit.Uncertainity,
                                Minimum.Orbit.Intersection,Hazardous)

# summary(test_KNN)

# normalizzazione
for (i in 1:(dim(test_KNN)[2]-1)){
  test_KNN[, i] <- (test_KNN[, i] - matrix_indicators_TRN_KNN[i, "min"])/(matrix_indicators_TRN_KNN[i, "max"]- matrix_indicators_TRN_KNN[i, "min"])
}

summary(test_KNN)

##  Absolute.Magnitude Orbit.Uncertainity Minimum.Orbit.Intersection Hazardous
##  Min.   :0.1595     Min.   :0.0000     Min.   :0.00000            0:779    
##  1st Qu.:0.4317     1st Qu.:0.0000     1st Qu.:0.02904            1:159    
##  Median :0.5081     Median :0.3333     Median :0.09748                     
##  Mean   :0.5272     Mean   :0.3753     Mean   :0.17472                     
##  3rd Qu.:0.6323     3rd Qu.:0.6667     3rd Qu.:0.26308                     
##  Max.   :0.8997     Max.   :1.0000     Max.   :0.95181

set.seed(123)
model <- knn(training_KNN[, -4], test_KNN[, -4], cl=training_KNN[, 4], k=3, prob=TRUE)

ALBERI DECISIONALI

set.seed (123)

tree.nasa.trn <- tree(Hazardous ~. , nasa_trn)

plot(tree.nasa.trn) ; text(tree.nasa.trn, pretty = 0) ; title(main = "Albero di classificazione non potato")

tree.pred1 <- predict(tree.nasa.trn , nasa_tst , type = "class")
confusionMatrix(tree.pred1, nasa_tst$Hazardous, positive="1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 775   2
##          1   4 157
##                                           
##                Accuracy : 0.9936          
##                  95% CI : (0.9861, 0.9976)
##     No Information Rate : 0.8305          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9774          
##                                           
##  Mcnemar's Test P-Value : 0.6831          
##                                           
##             Sensitivity : 0.9874          
##             Specificity : 0.9949          
##          Pos Pred Value : 0.9752          
##          Neg Pred Value : 0.9974          
##              Prevalence : 0.1695          
##          Detection Rate : 0.1674          
##    Detection Prevalence : 0.1716          
##       Balanced Accuracy : 0.9911          
##                                           
##        'Positive' Class : 1               
##

#PRUNING
set.seed (123)
cv.nasa <- cv.tree(tree.nasa.trn , FUN = prune.misclass)
#SCELGO LA SIZE MIGLIORE
par(mfrow = c(1, 2))
plot(cv.nasa$size , cv.nasa$dev, type = "b")
plot(cv.nasa$k, cv.nasa$dev, type = "b")

par(mfrow= c(1,1))

prune.nasa <- prune.misclass(tree.nasa.trn , best = 3)
#GRAFICO POTATO
plot(prune.nasa) ; text(prune.nasa , pretty = 0) ; title(main = "Albero di classificazione potato")

RANDOM FOREST

set.seed(123)
rf_trn_1 <- randomForest(Hazardous ~ ., nasa_trn, importance = T, proximity = T, ntree = 500)
rf_trn_1

## 
## Call:
##  randomForest(formula = Hazardous ~ ., data = nasa_trn, importance = T,      proximity = T, ntree = 500) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 0.37%
## Confusion matrix:
##      0   1 class.error
## 0 3146   7 0.002220108
## 1    7 589 0.011744966

plot(rf_trn_1)

prova <- rf_trn_1$err.rate
which.min(rf_trn_1$err.rate[,3])

## [1] 74

set.seed(123)
rf_trn_2 <- randomForest(Hazardous ~ ., nasa_trn, importance = T, proximity = T, ntree = 74)
rf_trn_2

## 
## Call:
##  randomForest(formula = Hazardous ~ ., data = nasa_trn, importance = T,      proximity = T, ntree = 74) 
##                Type of random forest: classification
##                      Number of trees: 74
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 0.35%
## Confusion matrix:
##      0   1 class.error
## 0 3145   8 0.002537266
## 1    5 591 0.008389262

varImpPlot(rf_trn_2, 
           main = "Importanza delle variabili per la distinzione tra Hazardous e Non-Hazardous")

PREVISIONE SU TEST

KNN

confusionMatrix(data=model,reference=factor(test_KNN[, 4]), positive="1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 763   0
##          1  16 159
##                                           
##                Accuracy : 0.9829          
##                  95% CI : (0.9724, 0.9902)
##     No Information Rate : 0.8305          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9417          
##                                           
##  Mcnemar's Test P-Value : 0.0001768       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9795          
##          Pos Pred Value : 0.9086          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.1695          
##          Detection Rate : 0.1695          
##    Detection Prevalence : 0.1866          
##       Balanced Accuracy : 0.9897          
##                                           
##        'Positive' Class : 1               
##

ALBERI DECISIONALI

tree.pred2 <- predict(prune.nasa , nasa_tst , type = "class")

confusionMatrix(tree.pred2, nasa_tst$Hazardous, positive="1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 775   2
##          1   4 157
##                                           
##                Accuracy : 0.9936          
##                  95% CI : (0.9861, 0.9976)
##     No Information Rate : 0.8305          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9774          
##                                           
##  Mcnemar's Test P-Value : 0.6831          
##                                           
##             Sensitivity : 0.9874          
##             Specificity : 0.9949          
##          Pos Pred Value : 0.9752          
##          Neg Pred Value : 0.9974          
##              Prevalence : 0.1695          
##          Detection Rate : 0.1674          
##    Detection Prevalence : 0.1716          
##       Balanced Accuracy : 0.9911          
##                                           
##        'Positive' Class : 1               
##

RANDOM FOREST

set.seed(123)
rf_pred <- predict(rf_trn_2, nasa_tst) %>%
  as.data.frame() %>%
  mutate(Hazardous = as.factor(`.`)) %>%
  select(Hazardous)

confusionMatrix(rf_pred$Hazardous, nasa_tst$Hazardous, positive="1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 777   0
##          1   2 159
##                                           
##                Accuracy : 0.9979          
##                  95% CI : (0.9923, 0.9997)
##     No Information Rate : 0.8305          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9925          
##                                           
##  Mcnemar's Test P-Value : 0.4795          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9974          
##          Pos Pred Value : 0.9876          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.1695          
##          Detection Rate : 0.1695          
##    Detection Prevalence : 0.1716          
##       Balanced Accuracy : 0.9987          
##                                           
##        'Positive' Class : 1               
##

# VISUALIZATION POINTS

rf_class <- data.frame(actual = nasa_tst$Hazardous,
                       predicted = rf_pred$Hazardous) %>%
  mutate(Status = ifelse(actual == predicted, "CORRETTI" , "SBAGLIATI"))


ggplot(data = rf_class, mapping = aes(x = predicted, y = actual, 
                     color = Status, shape = Status)) +
  scale_color_manual(values = c("green2","red2")) +
  geom_jitter(size = 2, alpha = 0.8) +
  labs(x = "Esito previsto",
       y = "Esito reale", title= "Previsioni con modello RandomForest (tree = 40)") +
  theme_bw()

# F1 score KNN
F1_KNN <- confusionMatrix(data=model,reference=factor(test_KNN[, 4]), positive="1")$byClass[7]
F1_KNN

##        F1 
## 0.9520958

F1_DT <- confusionMatrix(tree.pred2, nasa_tst$Hazardous, positive="1")$byClass[7]
F1_DT

##      F1 
## 0.98125

F1_RF <- confusionMatrix(rf_pred$Hazardous, nasa_tst$Hazardous, positive="1")$byClass[7]
F1_RF

##      F1 
## 0.99375

prob_knn <- attr(model, "prob")
prob_knn <- 2*ifelse(model == "0", 1-prob_knn, prob_knn) -1
pred_rocknn <- prediction(prob_knn, test_KNN[, 4])
pred_knn <- performance(pred_rocknn, "tpr", "fpr")
auc_knn <- performance(pred_rocknn, measure = "auc")@y.values
auc_knn

## [[1]]
## [1] 0.9965324

tree.preds <- predict(prune.nasa, nasa_tst, type="vector")[,"1"]
pred_roctree <- prediction(tree.preds, nasa_tst$Hazardous)
pred_tree <- performance(pred_roctree, "tpr", "fpr")
auc_tree <- performance(pred_roctree, measure = "auc")@y.values
auc_tree

## [[1]]
## [1] 0.9948168

pred_rocRF <- predict(rf_trn_2, newdata=nasa_tst, type="prob")
pred_rocRF <- pred_rocRF[,"1"]
pred_rocRF <- prediction(pred_rocRF, nasa_tst$Hazardous)
pred_RF <- performance(pred_rocRF, "tpr", "fpr")
auc_RF <- performance(pred_rocRF, measure = "auc")@y.values
auc_RF

## [[1]]
## [1] 0.9999677

par(mfrow=c(1,3))
plot(pred_knn, colorize=T, lwd=3) ; abline(a=0, b=1, lty=2, col="gray") ; title(main = "Curva di ROC per KNN", sub = paste("AUC equal to:", round(auc_knn[[1]],3)), cex.sub = 1.4)
plot(pred_tree, colorize=T, lwd=3, main="Curva di ROC per Alberi Decisionali"); abline(a=0, b=1, lty=2, col="gray"); title(sub = paste("AUC pari a:", round(auc_tree[[1]],3)), cex.sub = 1.4)
plot(pred_RF, colorize=T, lwd=3, main="Curva di ROC per RF"); abline(a=0, b=1, lty=2, col="gray"); title(sub = paste("AUC pari a:", round(auc_RF[[1]],5)), cex.sub = 1.4)

Homework DataMining

Francesca Verna, Edoardo Olivieri

2024-04-12

R Markdown