Econometrics_Final_Project

REGRESSION ANALYSIS

A regression model is used to describe the relationship between independent (input) variables and a dependent (output) variable. The main goal of regression analysis is to find a function that characterizes this relationship and then to use that function to make predictions for new data.
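
To make this concrete, here is a minimal sketch on synthetic data (illustrative only; x, y, and toy_fit are hypothetical names, not part of the project data): we simulate y = 2 + 3x + noise and let lm() recover the coefficients.

# Illustrative example: simple linear regression on simulated data
set.seed(1)
x <- runif(100)                          # predictor
y <- 2 + 3 * x + rnorm(100, sd = 0.5)    # outcome with known coefficients
toy_fit <- lm(y ~ x)                     # estimate intercept and slope
coef(toy_fit)                            # estimates should be close to (2, 3)
predict(toy_fit, newdata = data.frame(x = 0.5))  # prediction for a new input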

# Load the required libraries
library(readr)
library(dplyr)
library(corrplot)
library(caTools)
train <- read_csv("train_c.csv")
## Rows: 8693 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): PassengerId, HomePlanet, Destination, deck, side
## dbl (8): Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck, withgroup, ...
## lgl (3): CryoSleep, VIP, Transported
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
test <- read_csv("test_c.csv")
## Rows: 4277 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): PassengerId, HomePlanet, Destination, deck, side
## dbl (8): Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck, withgroup, ...
## lgl (2): CryoSleep, VIP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Encode the categorical columns as factors
train$HomePlanet <- as.factor(train$HomePlanet)
train$Destination <- as.factor(train$Destination)
train$deck <- as.factor(train$deck)
train$side <- as.factor(train$side)
test$HomePlanet <- as.factor(test$HomePlanet)
test$Destination <- as.factor(test$Destination)
test$deck <- as.factor(test$deck)
test$side <- as.factor(test$side)
# Coerce every column to an integer code and plot the correlation matrix
D <- train[, 2:16] %>% mutate(across(everything(), ~ as.integer(.)))
kor <- cor(D)
corrplot.mixed(kor)

# Linear probability model on the full training data; the summary below reports
# the expense coefficient as NA because of a singularity (perfect collinearity)
model <- lm(Transported ~ ., data = train[, 2:16])
summary(model)
## 
## Call:
## lm(formula = Transported ~ ., data = train[, 2:16])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.42529 -0.31010 -0.02972  0.28890  1.79353 
## 
## Coefficients: (1 not defined because of singularities)
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               3.015e-01  4.015e-02   7.509 6.55e-14 ***
## HomePlanetEuropa          2.255e-01  2.866e-02   7.869 4.02e-15 ***
## HomePlanetMars            9.879e-02  1.478e-02   6.684 2.46e-11 ***
## CryoSleepTRUE             3.791e-01  1.154e-02  32.862  < 2e-16 ***
## DestinationPSO J318.5-22 -4.128e-02  1.807e-02  -2.285 0.022343 *  
## DestinationTRAPPIST-1e   -4.695e-02  1.131e-02  -4.150 3.35e-05 ***
## Age                      -2.170e-03  3.201e-04  -6.781 1.28e-11 ***
## VIPTRUE                  -3.835e-02  2.974e-02  -1.290 0.197237    
## RoomService              -1.179e-04  7.055e-06 -16.713  < 2e-16 ***
## FoodCourt                 4.263e-05  3.055e-06  13.953  < 2e-16 ***
## ShoppingMall              7.946e-05  7.457e-06  10.655  < 2e-16 ***
## Spa                      -8.693e-05  4.111e-06 -21.144  < 2e-16 ***
## VRDeck                   -8.289e-05  4.115e-06 -20.144  < 2e-16 ***
## withgroup                 2.030e-02  9.286e-03   2.186 0.028819 *  
## deckB                     1.109e-01  2.874e-02   3.858 0.000115 ***
## deckC                     1.491e-01  2.900e-02   5.142 2.78e-07 ***
## deckD                     5.528e-02  3.472e-02   1.592 0.111351    
## deckE                     1.614e-02  3.611e-02   0.447 0.655048    
## deckF                     1.129e-01  3.702e-02   3.051 0.002291 ** 
## deckG                     6.925e-02  3.861e-02   1.793 0.072943 .  
## deckT                     6.696e-02  1.815e-01   0.369 0.712284    
## sideS                     8.501e-02  8.630e-03   9.851  < 2e-16 ***
## expense                          NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4012 on 8671 degrees of freedom
## Multiple R-squared:  0.3576, Adjusted R-squared:  0.3561 
## F-statistic: 229.9 on 21 and 8671 DF,  p-value: < 2.2e-16
# Split the training data into training and validation sets
set.seed(123)
split = sample.split(train$Transported, SplitRatio = 0.75)
train_train = subset(train, split == TRUE)
train_test = subset(train, split == FALSE)

# Train the model on the training split
reg <- lm(Transported ~ ., data = train_train[, -c(1)])
summary(reg)
## 
## Call:
## lm(formula = Transported ~ ., data = train_train[, -c(1)])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.50135 -0.31246 -0.02948  0.29157  1.77270 
## 
## Coefficients: (1 not defined because of singularities)
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               3.358e-01  4.638e-02   7.241 4.95e-13 ***
## HomePlanetEuropa          2.084e-01  3.341e-02   6.239 4.69e-10 ***
## HomePlanetMars            8.879e-02  1.703e-02   5.212 1.92e-07 ***
## CryoSleepTRUE             3.754e-01  1.329e-02  28.237  < 2e-16 ***
## DestinationPSO J318.5-22 -5.815e-02  2.081e-02  -2.795  0.00521 ** 
## DestinationTRAPPIST-1e   -5.058e-02  1.307e-02  -3.870  0.00011 ***
## Age                      -2.333e-03  3.723e-04  -6.266 3.95e-10 ***
## VIPTRUE                  -1.722e-02  3.384e-02  -0.509  0.61083    
## RoomService              -1.177e-04  7.788e-06 -15.113  < 2e-16 ***
## FoodCourt                 3.882e-05  3.468e-06  11.194  < 2e-16 ***
## ShoppingMall              8.269e-05  8.413e-06   9.829  < 2e-16 ***
## Spa                      -8.583e-05  4.707e-06 -18.232  < 2e-16 ***
## VRDeck                   -8.224e-05  4.775e-06 -17.221  < 2e-16 ***
## withgroup                 1.634e-02  1.075e-02   1.520  0.12866    
## deckB                     1.050e-01  3.289e-02   3.193  0.00142 ** 
## deckC                     1.448e-01  3.317e-02   4.365 1.29e-05 ***
## deckD                     3.583e-02  3.961e-02   0.905  0.36576    
## deckE                    -1.742e-02  4.196e-02  -0.415  0.67797    
## deckF                     9.287e-02  4.276e-02   2.172  0.02991 *  
## deckG                     5.093e-02  4.463e-02   1.141  0.25379    
## deckT                     5.756e-02  1.822e-01   0.316  0.75208    
## sideS                     8.875e-02  9.981e-03   8.892  < 2e-16 ***
## expense                          NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4012 on 6498 degrees of freedom
## Multiple R-squared:  0.3582, Adjusted R-squared:  0.3561 
## F-statistic: 172.7 on 21 and 6498 DF,  p-value: < 2.2e-16
# Make predictions on the validation split
reg_tahmin = predict(reg, newdata = train_test[, -c(1,12)])
reg_transported_tahmin <- ifelse(reg_tahmin > 0.5, 1, 0)

# Prepare the actual values as 0/1
transported_gercek <- ifelse(train_test$Transported == TRUE, 1, 0)
cm = table(transported_gercek, reg_transported_tahmin)

# Confusion matrix and accuracy
cm
##                   reg_transported_tahmin
## transported_gercek   0   1
##                  0 903 176
##                  1 323 771
accuracy <- (cm[1,1] + cm[2,2]) / sum(cm)
accuracy
## [1] 0.7703636
# Accuracy on the training split, for comparison
reg_tahmin = predict(reg, newdata = train_train[, -c(1,12)])
reg_transported_tahmin <- ifelse(reg_tahmin > 0.5, 1, 0)
transported_gercek <- ifelse(train_train[12] == TRUE, 1, 0)
cm = table(transported_gercek, reg_transported_tahmin)
cm
##                   reg_transported_tahmin
## transported_gercek    0    1
##                  0 2681  555
##                  1  949 2335
accuracy <- (cm[1,1] + cm[2,2]) / sum(cm)
accuracy
## [1] 0.7693252
# Predict on the Kaggle test set with the full-data model and build the submission
reg_tahmin_bd = predict(model, newdata = test[, -c(1)])
reg_transported_test_tahmin <- ifelse(reg_tahmin_bd > 0.5, TRUE, FALSE)
Transported <- as.character(reg_transported_test_tahmin)
PassengerId <- test$PassengerId
Transported <- as.vector(Transported)
submission_reg <- cbind(PassengerId, Transported)
submission_reg <- as.data.frame(submission_reg)
library(stringr)
submission_reg$Transported <- str_to_title(submission_reg$Transported)
write.csv(submission_reg, "regresyon_tahmini.csv", row.names = FALSE, quote = FALSE)

LOGISTIC REGRESSION ANALYSIS

Since the Spaceship Titanic data poses a binary classification problem, logistic regression is an appropriate model. With it, we can predict whether or not a passenger was transported (0 or 1).
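
A minimal sketch of how glm() fits a logistic model (synthetic data; toy_logit and p_true are hypothetical names). Note that predict() on a glm returns log-odds unless type = "response" is requested, a detail that matters for the 0.5 cutoffs used below.

# Illustrative example: logistic regression with glm()
set.seed(1)
x <- rnorm(200)
p_true <- 1 / (1 + exp(-(0.5 + 1.5 * x)))   # true probabilities
y <- rbinom(200, 1, p_true)                 # binary outcome
toy_logit <- glm(y ~ x, family = binomial)
head(predict(toy_logit))                     # log-odds (link scale)
head(predict(toy_logit, type = "response"))  # probabilities in [0, 1]
toy_class <- ifelse(predict(toy_logit, type = "response") > 0.5, 1, 0)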

# Log-transform the skewed numeric columns (log(1 + x))
train_log <- train %>%
  mutate_at(c(5, 7:11, 16), ~ log(1 + .))
test_log <- test %>%
  mutate_at(c(5, 7:11, 15), ~ log(1 + .))
# Note: despite the section title, this first model is a linear probability model
# fitted with lm() on the log-transformed features; the glm() logistic fit follows
modellog <- lm(Transported ~ 1 + HomePlanet + Destination + deck + side + CryoSleep + Age + RoomService + FoodCourt + ShoppingMall + Spa, data = train_log)
reg_tahmin_log = predict(modellog, newdata = test_log[, -c(1)])
reg_transported_test_tahmin_log <- ifelse(reg_tahmin_log > 0.5, TRUE, FALSE)
Transported <- as.character(reg_transported_test_tahmin_log)
PassengerId <- test$PassengerId
Transported <- as.vector(Transported)
submission_reg_log <- cbind(PassengerId, Transported)
submission_reg_log <- as.data.frame(submission_reg_log)
submission_reg_log$Transported <- str_to_title(submission_reg_log$Transported)
write.csv(submission_reg, "submission_reg_log.csv", row.names = FALSE, quote = FALSE)
# Logistic regression proper, fitted with glm() on the training split
logistic = glm(formula = Transported ~ .,
               family = binomial,
               data = train_train[, -c(1)])
summary(logistic)
## 
## Call:
## glm(formula = Transported ~ ., family = binomial, data = train_train[, 
##     -c(1)])
## 
## Coefficients: (1 not defined because of singularities)
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -4.855e-01  3.587e-01  -1.353 0.175933    
## HomePlanetEuropa          1.570e+00  2.653e-01   5.918 3.26e-09 ***
## HomePlanetMars            5.361e-01  1.105e-01   4.849 1.24e-06 ***
## CryoSleepTRUE             1.276e+00  9.197e-02  13.873  < 2e-16 ***
## DestinationPSO J318.5-22 -4.923e-01  1.299e-01  -3.790 0.000151 ***
## DestinationTRAPPIST-1e   -4.611e-01  9.418e-02  -4.896 9.78e-07 ***
## Age                      -8.311e-03  2.460e-03  -3.379 0.000728 ***
## VIPTRUE                  -1.679e-01  2.924e-01  -0.574 0.565911    
## RoomService              -1.738e-03  1.135e-04 -15.313  < 2e-16 ***
## FoodCourt                 4.547e-04  4.453e-05  10.211  < 2e-16 ***
## ShoppingMall              5.275e-04  7.562e-05   6.976 3.04e-12 ***
## Spa                      -1.968e-03  1.166e-04 -16.878  < 2e-16 ***
## VRDeck                   -1.896e-03  1.156e-04 -16.400  < 2e-16 ***
## withgroup                 1.267e-01  7.183e-02   1.764 0.077800 .  
## deckB                     1.126e+00  2.899e-01   3.884 0.000103 ***
## deckC                     2.454e+00  3.315e-01   7.404 1.33e-13 ***
## deckD                     6.903e-01  3.245e-01   2.127 0.033389 *  
## deckE                     9.830e-02  3.352e-01   0.293 0.769339    
## deckF                     7.773e-01  3.397e-01   2.288 0.022127 *  
## deckG                     3.810e-01  3.490e-01   1.092 0.274904    
## deckT                    -1.275e-01  1.869e+00  -0.068 0.945583    
## sideS                     5.984e-01  6.663e-02   8.981  < 2e-16 ***
## expense                          NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9038.3  on 6519  degrees of freedom
## Residual deviance: 5601.5  on 6498  degrees of freedom
## AIC: 5645.5
## 
## Number of Fisher Scoring iterations: 7
# Without type = "response", predict() returns log-odds, so the 0.5 cutoff below
# is applied on the link scale (it corresponds to a probability of about 0.62)
log_tahmin = predict(logistic, newdata = train_test[, -c(1,12)])
head(log_tahmin)
##           1           2           3           4           5           6 
##  -1.1063121 -11.0565547   0.9680020  -1.6266855  -0.4167855  -1.7011268
log_transported_tahmin <- ifelse(log_tahmin > 0.5, 1, 0)
head(log_transported_tahmin)
## 1 2 3 4 5 6 
## 0 0 1 0 0 0
transported_gercek <- ifelse(train_test[12] == TRUE, 1, 0)
library(tidymodels)
result = data.frame(cbind(transported_gercek, log_transported_tahmin))
result$Transported <- as.factor(result$Transported)
result$log_transported_tahmin <- as.factor(result$log_transported_tahmin)
accuracy(result, truth = Transported, estimate = log_transported_tahmin)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.784
conf_mat(result, truth = Transported, estimate = log_transported_tahmin)
##           Truth
## Prediction   0   1
##          0 918 308
##          1 161 786
library(caret)
cm = table(transported_gercek, log_transported_tahmin)
cm
##                   log_transported_tahmin
## transported_gercek   0   1
##                  0 918 161
##                  1 308 786
accuracy <- (cm[1,1] + cm[2,2]) / sum(cm)
accuracy
## [1] 0.7841694
# Note: confusionMatrix() expects the predictions first and the reference second;
# here the actual values are passed first, so the two roles are swapped in the output
confusionMatrix(as.factor(transported_gercek), as.factor(log_transported_tahmin))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 918 161
##          1 308 786
##                                           
##                Accuracy : 0.7842          
##                  95% CI : (0.7663, 0.8013)
##     No Information Rate : 0.5642          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5687          
##                                           
##  Mcnemar's Test P-Value : 1.566e-11       
##                                           
##             Sensitivity : 0.7488          
##             Specificity : 0.8300          
##          Pos Pred Value : 0.8508          
##          Neg Pred Value : 0.7185          
##              Prevalence : 0.5642          
##          Detection Rate : 0.4225          
##    Detection Prevalence : 0.4965          
##       Balanced Accuracy : 0.7894          
##                                           
##        'Positive' Class : 0               
## 
logistic_bd = glm(formula = Transported ~ .,
               family = binomial,
               data = train[, -c(1)])
# type = "response" returns probabilities, so the 0.5 cutoff is on the right scale
log_tahmin_bd = predict(logistic_bd, newdata = test[, -c(1)], type = "response")
log_transported_test_tahmin <- ifelse(log_tahmin_bd > 0.5, TRUE, FALSE)
Transported <- as.character(log_transported_test_tahmin)
PassengerId <- test$PassengerId
Transported <- as.vector(Transported)
submission_logistic <- cbind(PassengerId, Transported)
submission_logistic <- as.data.frame(submission_logistic)
submission_logistic$Transported <- str_to_title(submission_logistic$Transported)
write.csv(submission_logistic, "submission_log.csv", row.names = FALSE, quote = FALSE)
logistic_bd = glm(formula = Transported ~ .,
               family = binomial,
               data = train_log[, -c(1)])
log_tahmin_bd = predict(logistic_bd, newdata = test_log[, -c(1)], type = "response")
log_transported_test_tahmin <- ifelse(log_tahmin_bd > 0.5, TRUE, FALSE)
Transported <- as.character(log_transported_test_tahmin)
PassengerId <- test$PassengerId
Transported <- as.vector(Transported)
submission_llogistic <- cbind(PassengerId, Transported)
submission_llogistic <- as.data.frame(submission_llogistic)
submission_llogistic$Transported <- str_to_title(submission_llogistic$Transported)
write.csv(submission_logistic, "submission_llog.csv", row.names = FALSE, quote = FALSE)

NAIVE BAYES ANALYSIS

Naive Bayes is a fast and simple algorithm, generally well suited to large data sets and to text classification problems. In R, packages such as e1071 and naivebayes let you fit and analyze a Naive Bayes model easily. With properly prepared data and a trained model, these packages can deliver good results on classification problems.
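
A minimal sketch of the e1071 interface on the built-in iris data (nb_toy is a hypothetical name; the project's actual fit follows below):

# Illustrative example: Naive Bayes with e1071 on the iris data
library(e1071)
nb_toy <- naiveBayes(Species ~ ., data = iris)   # class-conditional distributions per feature
head(predict(nb_toy, iris))                      # predicted class labels
head(predict(nb_toy, iris, type = "raw"))        # posterior probabilities per class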

library(e1071)
fit_nb <- naiveBayes(Transported ~ ., data = train_train[, -1])
fit_nb
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    FALSE     TRUE 
## 0.496319 0.503681 
## 
## Conditional probabilities:
##        HomePlanet
## Y           Earth    Europa      Mars
##   FALSE 0.6220643 0.1755253 0.2024104
##   TRUE  0.4604141 0.3306943 0.2088916
## 
##        CryoSleep
## Y           FALSE      TRUE
##   FALSE 0.8702101 0.1297899
##   TRUE  0.4336175 0.5663825
## 
##        Destination
## Y       55 Cancri e PSO J318.5-22 TRAPPIST-1e
##   FALSE  0.16069221    0.09363412  0.74567367
##   TRUE   0.24969549    0.09043849  0.65986602
## 
##        Age
## Y           [,1]     [,2]
##   FALSE 30.01119 13.45225
##   TRUE  27.85293 14.87588
## 
##        VIP
## Y            FALSE       TRUE
##   FALSE 0.97126082 0.02873918
##   TRUE  0.98142509 0.01857491
## 
##        RoomService
## Y            [,1]     [,2]
##   FALSE 402.20365 916.6547
##   TRUE   56.93484 246.8105
## 
##        FoodCourt
## Y           [,1]     [,2]
##   FALSE 396.2923 1258.123
##   TRUE  514.8018 1918.620
## 
##        ShoppingMall
## Y           [,1]     [,2]
##   FALSE 160.2250 432.1103
##   TRUE  182.7135 748.5867
## 
##        Spa
## Y            [,1]      [,2]
##   FALSE 561.16193 1554.2636
##   TRUE   61.11571  264.0556
## 
##        VRDeck
## Y            [,1]      [,2]
##   FALSE 533.98733 1542.8703
##   TRUE   69.07186  295.0542
## 
##        withgroup
## Y            [,1]      [,2]
##   FALSE 0.3952410 0.4889780
##   TRUE  0.4960414 0.5000605
## 
##        deck
## Y                  A            B            C            D            E
##   FALSE 0.0296662546 0.0543881335 0.0574783684 0.0661310260 0.1344252163
##   TRUE  0.0310596833 0.1385505481 0.1199756395 0.0490255786 0.0676004872
##        deck
## Y                  F            G            T
##   FALSE 0.3637206428 0.2929542645 0.0012360939
##   TRUE  0.2828867235 0.3105968331 0.0003045067
## 
##        side
## Y               P         S
##   FALSE 0.5568603 0.4431397
##   TRUE  0.4448843 0.5551157
## 
##        expense
## Y            [,1]     [,2]
##   FALSE 2053.8702 3224.219
##   TRUE   884.6376 2307.962
library(magrittr)
library(dplyr)
pred_nb <- predict(fit_nb, newdata = train_test[, -c(1,12)], type = "raw") %>%
  data.frame()
head(pred_nb)
##       FALSE.         TRUE.
## 1 0.22290777  7.770922e-01
## 2 1.00000000 8.657607e-135
## 3 0.02738093  9.726191e-01
## 4 0.59333559  4.066644e-01
## 5 0.02412042  9.758796e-01
## 6 0.38532665  6.146734e-01
Transported_preb_nb = ifelse(pred_nb$TRUE. > 0.5, 1, 0)
head(Transported_preb_nb)
## [1] 1 0 1 0 1 1
Transported_train_test <- ifelse(train_test[12] == TRUE, 1, 0)
head(Transported_train_test)
##      Transported
## [1,]           1
## [2,]           0
## [3,]           1
## [4,]           0
## [5,]           0
## [6,]           0
cm = table(Transported_train_test, Transported_preb_nb)
cm
##                       Transported_preb_nb
## Transported_train_test    0    1
##                      0  481  598
##                      1   77 1017
accuracy <- (cm[1,1] + cm[2,2]) / sum(cm)
accuracy
## [1] 0.6893695
nb = naiveBayes(Transported ~ ., data = train[, -1])
pred_nb <- predict(nb, newdata = test, type = "raw") %>%
  data.frame()
Transported_pred_nb = ifelse(pred_nb$TRUE. > 0.5, TRUE, FALSE)
library(stringr)
Transported <- as.character(Transported_pred_nb)
PassengerId <- test$PassengerId
Transported <- as.vector(Transported)
sample_submission <- cbind(PassengerId, Transported)
sample_submission <- as.data.frame(sample_submission)
sample_submission$Transported <- str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_nb.csv", row.names = FALSE, quote = FALSE)

SVM ANALYSIS

SVM (Support Vector Machines) is a powerful machine learning algorithm used for both classification and regression tasks. An SVM searches for the hyperplane that best separates the classes of data points, and it does so by maximizing the margin: the distance between the hyperplane and the nearest points of each class.
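
A minimal sketch of a linear-kernel SVM with e1071 (iris data for illustration; svm_toy is a hypothetical name; the cost parameter controls the trade-off between a wide margin and margin violations):

# Illustrative example: linear SVM; larger cost penalizes margin violations more
library(e1071)
svm_toy <- svm(Species ~ ., data = iris,
               type = 'C-classification',
               kernel = 'linear', cost = 1)
table(predicted = predict(svm_toy, iris), actual = iris$Species)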

fit_svm <- svm(Transported ~ ., data = train_train[, -1],
               type = 'C-classification',
               kernel= 'linear')

# predict.svm returns class labels directly; no type argument is needed here
preds <- predict(fit_svm, newdata = train_test[, -c(1,12)]) %>%
  data.frame()
head(preds)
##       .
## 1 FALSE
## 2 FALSE
## 3  TRUE
## 4 FALSE
## 5  TRUE
## 6 FALSE
Transported_pred_svm = ifelse(preds$. == TRUE, 1, 0)
cm = table(Transported_train_test, Transported_pred_svm)
cm
##                       Transported_pred_svm
## Transported_train_test   0   1
##                      0 831 248
##                      1 181 913
accuracy <- (cm[1,1] + cm[2,2]) / sum(cm)
accuracy
## [1] 0.8025771
fit_svm <- svm(Transported ~ ., data = train[, -1],
               type = 'C-classification',
               kernel= 'linear')

preds <- predict(fit_svm, newdata = test) %>%
  data.frame()
Transported_pred_svm = ifelse(preds$. == TRUE, TRUE, FALSE)
Transported <- as.character(Transported_pred_svm)
PassengerId <- test$PassengerId
Transported <- as.vector(Transported)
sample_submission <- cbind(PassengerId, Transported)
sample_submission <- as.data.frame(sample_submission)
sample_submission$Transported <- str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_svm.csv", row.names = FALSE, quote = FALSE)

SVM RADIAL ANALYSIS

The core idea of SVM is to find the best separating hyperplane. The Radial Basis Function (RBF), or radial kernel, is one of the most commonly used kernel functions in SVM: it implicitly maps the data into a higher-dimensional space in which they become linearly separable.
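
The radial kernel has two tuning parameters, gamma (kernel width) and cost; a hedged sketch of tuning them by cross-validation with e1071::tune.svm (illustrative grid on the iris data; svm's default kernel is already radial):

# Illustrative example: grid search over gamma and cost for the RBF kernel
library(e1071)
set.seed(1)
tuned <- tune.svm(Species ~ ., data = iris,
                  gamma = 2^(-3:1), cost = 2^(0:3))
tuned$best.parameters    # gamma/cost pair with the lowest cross-validation error
tuned$best.performance   # the corresponding error estimate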

fit_svm <- svm(Transported ~ ., data = train_train[, -1],
               type = 'C-classification',
               kernel= 'radial')
preds <- predict(fit_svm, newdata = train_test[, -c(1,12)]) %>%
  data.frame()
Transported_pred_svm = ifelse(preds$. == TRUE, 1, 0)
cm = table(Transported_train_test, Transported_pred_svm)
cm  # accuracy = (851 + 890) / 2173 ≈ 0.801
##                       Transported_pred_svm
## Transported_train_test   0   1
##                      0 851 228
##                      1 204 890
fit_svm <- svm(Transported ~ ., data = train[, -1],
               type = 'C-classification',
               kernel= 'radial')

preds <- predict(fit_svm, newdata = test) %>%
  data.frame()
Transported_pred_svm = ifelse(preds$. == TRUE, TRUE, FALSE)
Transported <- as.character(Transported_pred_svm)
PassengerId <- test$PassengerId
Transported <- as.vector(Transported)
sample_submission <- cbind(PassengerId, Transported)
sample_submission <- as.data.frame(sample_submission)
sample_submission$Transported <- str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_svm_rad.csv", row.names = FALSE, quote = FALSE)

p <- ggplot(train_train, aes(x = HomePlanet, y = deck, color = factor(Transported))) +
  geom_point(aes(shape = factor(Transported)), size = 3) +
  scale_color_viridis_d() +
  labs(title = "", x = "HomePlanet", y = "deck") +
  theme_minimal() +
  theme(legend.position = "top")
p

DECISION TREE ANALYSIS

A decision tree is an algorithm used in machine learning for classification and regression problems; it builds a tree structure by splitting the data into branches according to learned rules. The model produces its predictions through a sequence of if-then decisions on the feature values. Decision trees are also very intuitive to visualize, making the decision process explicit.
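
A minimal sketch of the rpart interface (iris data; tree_toy is a hypothetical name; the cp parameter controls complexity-based pruning):

# Illustrative example: fit, inspect, and prune a classification tree
library(rpart)
library(rpart.plot)
tree_toy <- rpart(Species ~ ., data = iris, method = "class")
rpart.plot(tree_toy)                       # visualize the if-then rules
printcp(tree_toy)                          # cross-validated error per complexity level
tree_pruned <- prune(tree_toy, cp = 0.05)  # cut back splits weaker than cp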

library(rpart)
## Warning: package 'rpart' was built under R version 4.3.3
## 
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
## 
##     prune
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(caret)
fit_tree <- rpart::rpart(Transported ~ ., data = train_train[, -1])
summary(fit_tree)
## Call:
## rpart::rpart(formula = Transported ~ ., data = train_train[, 
##     -1])
##   n= 6520 
## 
##           CP nsplit rel error    xerror         xstd
## 1 0.23321359      0 1.0000000 1.0001811 0.0001977604
## 2 0.04905507      1 0.7667864 0.7671015 0.0103212807
## 3 0.02807107      2 0.7177313 0.7180892 0.0090936774
## 4 0.02784643      3 0.6896603 0.7039381 0.0095638265
## 5 0.01770479      4 0.6618138 0.6661699 0.0097007659
## 6 0.01381587      5 0.6441090 0.6496207 0.0100644975
## 7 0.01194576      6 0.6302932 0.6389139 0.0102048399
## 8 0.01164524      7 0.6183474 0.6310762 0.0102269497
## 9 0.01000000      8 0.6067022 0.6242632 0.0102752753
## 
## Variable importance
##      expense    CryoSleep    FoodCourt          Spa       VRDeck  RoomService 
##           23           17           13           12           11            9 
##         deck   HomePlanet ShoppingMall          Age  Destination 
##            6            5            2            1            1 
## 
## Node number 1: 6520 observations,    complexity param=0.2332136
##   mean=0.503681, MSE=0.2499865 
##   left son=2 (3795 obs) right son=3 (2725 obs)
##   Primary splits:
##       expense     < 0.5     to the right, improve=0.2332136, (0 missing)
##       CryoSleep   < 0.5     to the left,  improve=0.2095384, (0 missing)
##       RoomService < 0.5     to the right, improve=0.1254121, (0 missing)
##       Spa         < 0.5     to the right, improve=0.1142180, (0 missing)
##       VRDeck      < 0.5     to the right, improve=0.1084074, (0 missing)
##   Surrogate splits:
##       CryoSleep   < 0.5     to the left,  agree=0.932, adj=0.837, (0 split)
##       Spa         < 0.5     to the right, agree=0.784, adj=0.484, (0 split)
##       FoodCourt   < 0.5     to the right, agree=0.770, adj=0.451, (0 split)
##       VRDeck      < 0.5     to the right, agree=0.768, adj=0.444, (0 split)
##       RoomService < 0.5     to the right, agree=0.757, adj=0.419, (0 split)
## 
## Node number 2: 3795 observations,    complexity param=0.02807107
##   mean=0.2990777, MSE=0.2096302 
##   left son=4 (3243 obs) right son=5 (552 obs)
##   Primary splits:
##       FoodCourt    < 1331    to the left,  improve=0.05751186, (0 missing)
##       ShoppingMall < 627.5   to the left,  improve=0.04530804, (0 missing)
##       RoomService  < 365.5   to the right, improve=0.04440387, (0 missing)
##       Spa          < 257.5   to the right, improve=0.03347453, (0 missing)
##       VRDeck       < 721     to the right, improve=0.02290457, (0 missing)
##   Surrogate splits:
##       expense    < 5981    to the left,  agree=0.885, adj=0.210, (0 split)
##       deck       splits as  RRRLLLLL,    agree=0.884, adj=0.205, (0 split)
##       HomePlanet splits as  LRL,         agree=0.878, adj=0.159, (0 split)
##       Spa        < 8955.5  to the left,  agree=0.856, adj=0.009, (0 split)
##       VRDeck     < 11692   to the left,  agree=0.856, adj=0.009, (0 split)
## 
## Node number 3: 2725 observations,    complexity param=0.04905507
##   mean=0.7886239, MSE=0.1666963 
##   left son=6 (1448 obs) right son=7 (1277 obs)
##   Primary splits:
##       deck        splits as  RRRRLRL-,    improve=0.17601740, (0 missing)
##       HomePlanet  splits as  LRR,         improve=0.12741890, (0 missing)
##       Destination splits as  RLL,         improve=0.02625136, (0 missing)
##       CryoSleep   < 0.5     to the left,  improve=0.02268236, (0 missing)
##       side        splits as  LR,          improve=0.01498707, (0 missing)
##   Surrogate splits:
##       HomePlanet  splits as  LRR,         agree=0.935, adj=0.861, (0 split)
##       Age         < 24.5    to the left,  agree=0.626, adj=0.201, (0 split)
##       Destination splits as  RLL,         agree=0.590, adj=0.126, (0 split)
##       withgroup   < 0.5     to the left,  agree=0.583, adj=0.110, (0 split)
##       VIP         < 0.5     to the left,  agree=0.538, adj=0.014, (0 split)
## 
## Node number 4: 3243 observations,    complexity param=0.02784643
##   mean=0.2537774, MSE=0.1893744 
##   left son=8 (2577 obs) right son=9 (666 obs)
##   Primary splits:
##       ShoppingMall < 541.5   to the left,  improve=0.07390355, (0 missing)
##       RoomService  < 365.5   to the right, improve=0.03464407, (0 missing)
##       Spa          < 240.5   to the right, improve=0.03327259, (0 missing)
##       VRDeck       < 114     to the right, improve=0.02784287, (0 missing)
##       expense      < 2867.5  to the right, improve=0.01811461, (0 missing)
##   Surrogate splits:
##       expense < 18644   to the left,  agree=0.795, adj=0.003, (0 split)
## 
## Node number 5: 552 observations,    complexity param=0.01770479
##   mean=0.5652174, MSE=0.2457467 
##   left son=10 (123 obs) right son=11 (429 obs)
##   Primary splits:
##       Spa       < 1372.5  to the right, improve=0.21272970, (0 missing)
##       VRDeck    < 1063.5  to the right, improve=0.17089500, (0 missing)
##       expense   < 5395    to the right, improve=0.06611166, (0 missing)
##       deck      splits as  LLRLLRRL,    improve=0.02845616, (0 missing)
##       FoodCourt < 2513    to the left,  improve=0.02780045, (0 missing)
##   Surrogate splits:
##       expense     < 12647   to the right, agree=0.790, adj=0.057, (0 split)
##       Age         < 13.5    to the left,  agree=0.779, adj=0.008, (0 split)
##       RoomService < 3895.5  to the right, agree=0.779, adj=0.008, (0 split)
## 
## Node number 6: 1448 observations
##   mean=0.6277624, MSE=0.2336768 
## 
## Node number 7: 1277 observations
##   mean=0.9710258, MSE=0.02813466 
## 
## Node number 8: 2577 observations,    complexity param=0.01194576
##   mean=0.193636, MSE=0.1561411 
##   left son=16 (2067 obs) right son=17 (510 obs)
##   Primary splits:
##       FoodCourt   < 456.5   to the left,  improve=0.04838893, (0 missing)
##       expense     < 1447.5  to the right, improve=0.04016842, (0 missing)
##       HomePlanet  splits as  RLL,         improve=0.02494631, (0 missing)
##       Spa         < 537.5   to the right, improve=0.01895890, (0 missing)
##       RoomService < 400.5   to the right, improve=0.01706521, (0 missing)
##   Surrogate splits:
##       expense < 12373   to the left,  agree=0.804, adj=0.008, (0 split)
##       Spa     < 13650   to the left,  agree=0.803, adj=0.006, (0 split)
##       VRDeck  < 10123.5 to the left,  agree=0.802, adj=0.002, (0 split)
##       deck    splits as  LLLLLLLR,    agree=0.802, adj=0.002, (0 split)
## 
## Node number 9: 666 observations
##   mean=0.4864865, MSE=0.2498174 
## 
## Node number 10: 123 observations
##   mean=0.1382114, MSE=0.119109 
## 
## Node number 11: 429 observations,    complexity param=0.01381587
##   mean=0.6876457, MSE=0.2147891 
##   left son=22 (143 obs) right son=23 (286 obs)
##   Primary splits:
##       VRDeck      < 611     to the right, improve=0.24438400, (0 missing)
##       Spa         < 225     to the right, improve=0.05300377, (0 missing)
##       FoodCourt   < 3119.5  to the left,  improve=0.05168044, (0 missing)
##       RoomService < 1719.5  to the right, improve=0.03930897, (0 missing)
##       side        splits as  LR,          improve=0.03734415, (0 missing)
##   Surrogate splits:
##       expense   < 6032    to the right, agree=0.702, adj=0.105, (0 split)
##       Age       < 53.5    to the right, agree=0.674, adj=0.021, (0 split)
##       FoodCourt < 12128.5 to the right, agree=0.671, adj=0.014, (0 split)
## 
## Node number 16: 2067 observations
##   mean=0.1504596, MSE=0.1278215 
## 
## Node number 17: 510 observations,    complexity param=0.01164524
##   mean=0.3686275, MSE=0.2327413 
##   left son=34 (204 obs) right son=35 (306 obs)
##   Primary splits:
##       expense    < 1447.5  to the right, improve=0.15990760, (0 missing)
##       VRDeck     < 86.5    to the right, improve=0.10060710, (0 missing)
##       HomePlanet splits as  RLL,         improve=0.07623828, (0 missing)
##       Spa        < 500     to the right, improve=0.07353369, (0 missing)
##       deck       splits as  LLLLRRRL,    improve=0.05075444, (0 missing)
##   Surrogate splits:
##       HomePlanet splits as  RLL,         agree=0.871, adj=0.676, (0 split)
##       deck       splits as  LLLLRRRL,    agree=0.839, adj=0.598, (0 split)
##       VRDeck     < 213.5   to the right, agree=0.818, adj=0.544, (0 split)
##       Spa        < 219.5   to the right, agree=0.792, adj=0.480, (0 split)
##       FoodCourt  < 907     to the right, agree=0.722, adj=0.304, (0 split)
## 
## Node number 22: 143 observations
##   mean=0.3636364, MSE=0.231405 
## 
## Node number 23: 286 observations
##   mean=0.8496503, MSE=0.1277446 
## 
## Node number 34: 204 observations
##   mean=0.1323529, MSE=0.1148356 
## 
## Node number 35: 306 observations
##   mean=0.5261438, MSE=0.2493165
rpart.plot(fit_tree)

preds <- predict(fit_tree, newdata = train_test[, - c(1,12)]) %>%
  data.frame()
head(preds)
##           .
## 1 0.1504596
## 2 0.1382114
## 3 0.8496503
## 4 0.1504596
## 5 0.4864865
## 6 0.1504596
Transported_pred_tree = ifelse(preds$. > 0.5, 1, 0)
cm = table(Transported_train_test, Transported_pred_tree)
cm
##                       Transported_pred_tree
## Transported_train_test   0   1
##                      0 807 272
##                      1 237 857
accuracy <- (cm[1,1] + cm[2,2]) / sum(cm)
accuracy
## [1] 0.7657616
fit_tree <- rpart(Transported ~ ., data = train[, -1])

preds <- predict(fit_tree, newdata = test) %>%
  data.frame()
Transported_pred_tree = ifelse(preds$. > 0.5, TRUE, FALSE)
Transported <- as.character(Transported_pred_tree)
PassengerId <- test$PassengerId
Transported <- as.vector(Transported)
sample_submission <- cbind(PassengerId, Transported)
sample_submission <- as.data.frame(sample_submission)
sample_submission$Transported <- str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_tree.csv", row.names = FALSE, quote = FALSE)

RANDOM FOREST ANALYSIS

Random Forest is a powerful classification and regression method in machine learning capable of highly accurate predictions. Developed by Leo Breiman, it is an ensemble method that combines many decision trees: each tree is trained on a different random subset of the data, and the final prediction is the majority vote (classification) or the average (regression) of the trees.
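
A minimal sketch of randomForest's key knobs (iris data; rf_toy is a hypothetical name; ntree is the number of bootstrapped trees, mtry the number of candidate features tried at each split, and the out-of-bag error gives a built-in validation estimate):

# Illustrative example: random forest with out-of-bag (OOB) error
library(randomForest)
set.seed(1)
rf_toy <- randomForest(Species ~ ., data = iris,
                       ntree = 500,   # number of bootstrapped trees
                       mtry = 2,      # features tried at each split
                       importance = TRUE)
rf_toy$err.rate[500, "OOB"]           # OOB error after all 500 trees
importance(rf_toy)                    # per-feature importance measures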

# These packages were already loaded above; repeated here for completeness
library(rpart)
library(rpart.plot)
library(randomForest)
library(caret)
fit_forest <- randomForest(Transported ~ ., data = train_train[, -1])
fit_forest$importance
##              IncNodePurity
## HomePlanet        48.21166
## CryoSleep        102.47495
## Destination       25.77605
## Age               92.07261
## VIP                2.43084
## RoomService      124.63959
## FoodCourt        117.08030
## ShoppingMall      99.23254
## Spa              126.77091
## VRDeck           119.08002
## withgroup         12.85779
## deck              91.49524
## side              22.59922
## expense          248.44639
varImpPlot(fit_forest)

preds <- predict(fit_forest, newdata = train_test[,- c(1,12)]) %>%
  data.frame()
head(preds)
##           .
## 1 0.1049532
## 2 0.1112889
## 3 0.8129808
## 4 0.2158803
## 5 0.6832579
## 6 0.1744000
Transported_pred_forest = ifelse(preds$. > 0.5, 1, 0)
cm = table(Transported_train_test, Transported_pred_forest)
cm
##                       Transported_pred_forest
## Transported_train_test   0   1
##                      0 823 256
##                      1 175 919
accuracy <- (cm[1,1] + cm[2,2]) / sum(cm)
accuracy
## [1] 0.8016567
fit_forest <- randomForest(Transported ~ ., data = train[, -1])
preds <- predict(fit_forest, newdata = test) %>%
  data.frame()
Transported_pred_forest = ifelse(preds$. > 0.5, TRUE, FALSE)
Transported <- as.character(Transported_pred_forest)
PassengerId <- test$PassengerId
Transported <- as.vector(Transported)
sample_submission <- cbind(PassengerId, Transported)
sample_submission <- as.data.frame(sample_submission)
sample_submission$Transported <- str_to_title(sample_submission$Transported)
write.csv(sample_submission, "sub_random_for.csv", row.names = FALSE, quote = FALSE)

RESULTS

# Validation-set accuracies of each model, as computed above (rounded)
regresyon_accuracy <- 0.77
lojistik_regresyon_accuracy <- 0.78
naive_bayes_accuracy <- 0.69
svm_accuracy <- 0.80
svm_radial_accuracy <- 0.80
decision_tree_accuracy <- 0.77
random_forest_accuracy <- 0.80
# Build the data frame of model accuracies
dogruluk_performans <- data.frame(
  Modeller = c("Regresyon", "Lojistik Regresyon", "Naive Bayes", "SVM", "SVM Radial","Decision Trees", "Random Forest"),
  Dogruluk = c(regresyon_accuracy, lojistik_regresyon_accuracy, naive_bayes_accuracy, svm_accuracy, svm_radial_accuracy,decision_tree_accuracy, random_forest_accuracy)
)
ggplot(data = dogruluk_performans, aes(x = Modeller, y = Dogruluk)) +
  geom_bar(stat = "identity", fill = "blue", width = 0.3) +
  labs(title = "Accuracy Comparison of the Models", x = "Models", y = "Accuracy") +
  theme(plot.title = element_text(hjust = 0.5))

# Dot plot of the same accuracies, reusing the data frame built above
ggplot(data = dogruluk_performans, aes(x = Modeller, y = Dogruluk)) +
  geom_point(size = 4, color = "blue") +
  labs(title = "Accuracy Comparison of the Models", x = "Models", y = "Accuracy") +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(aes(label = Dogruluk), vjust = -1.5, size = 3.5)

  • Having applied these different models in our study, we obtained a range of results and brought the project to a close. The sections above show how every model was used and what results it produced: the SVM and Random Forest models reached the highest validation accuracy (about 0.80), while Naive Bayes performed worst (about 0.69).