remove(list = ls())
train <- read.csv("~/Library/CloudStorage/Dropbox/WCAS/Summer/Data Analysis/Summer 2024/Day 7/train.csv")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# cor(train)
train_numeric <- select(.data = train, -Name, -Sex, -Ticket,-Cabin,-Embarked)
train_corr_matrix <- cor(train_numeric, use = "pairwise.complete.obs")
?cor
#install.packages(ggcorrplot)
library(ggcorrplot)
?ggcorrplot
ggcorrplot(corr = train_corr_matrix, type = "lower")
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
train$Sex_Male <- as.numeric(train$Sex=="male")
stargazer(train, type = "text")
##
## ==============================================
## Statistic N Mean St. Dev. Min Max
## ----------------------------------------------
## PassengerId 891 446.000 257.354 1 891
## Survived 891 0.384 0.487 0 1
## Pclass 891 2.309 0.836 1 3
## Age 714 29.699 14.526 0.420 80.000
## SibSp 891 0.523 1.103 0 8
## Parch 891 0.382 0.806 0 6
## Fare 891 32.204 49.693 0.000 512.329
## Sex_Male 891 0.648 0.478 0 1
## ----------------------------------------------
set.seed(100)
?sample_n
train_subset <- sample_n(tbl = train,
size = 500)
test_subset <- dplyr::anti_join(x = train,
y = train_subset,
by = "PassengerId")
n=500
sample.model0 <-
lm(data = train_subset,
formula = Survived ~ as.factor(Pclass) + Sex + Age + I(Age^2) + Fare)
model1 <-
lm(data = train_subset,
formula = Survived ~ as.factor(Pclass) + Sex + Fare)
summary(model1)
##
## Call:
## lm(formula = Survived ~ as.factor(Pclass) + Sex + Fare, data = train_subset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8956 -0.2806 -0.1234 0.2446 0.8787
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.8534440 0.0616989 13.832 < 2e-16 ***
## as.factor(Pclass)2 -0.1038129 0.0631827 -1.643 0.101
## as.factor(Pclass)3 -0.2595314 0.0546242 -4.751 2.65e-06 ***
## Sexmale -0.4726555 0.0398321 -11.866 < 2e-16 ***
## Fare 0.0002781 0.0004890 0.569 0.570
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4073 on 495 degrees of freedom
## Multiple R-squared: 0.3013, Adjusted R-squared: 0.2957
## F-statistic: 53.38 on 4 and 495 DF, p-value: < 2.2e-16
What is on the rows vs what is on the columns? Example
What is event of interest (positive)? EG - positive is survived or died?
?predict
test_subset$prediction <- predict(object = model1, newdata = test_subset)
summary(test_subset$prediction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1213 0.1238 0.3808 0.4096 0.6003 0.9959
test_subset$predicted_Survived <- ifelse(test = test_subset$prediction > .5, yes = 1, no = 0)
confusion_matrix_hand <-
as.matrix(table(test_subset$Survived,
test_subset$predicted_Survived
))
confusion_matrix_hand
##
## 0 1
## 0 202 36
## 1 36 117
TP <- confusion_matrix_hand[2,2]
TN <- confusion_matrix_hand[1,1]
FP <- confusion_matrix_hand[2,1]
FN <- confusion_matrix_hand[1,2]
accuracy <- (TP+TN)/(TP+TN+FP+FN)
sentivity <- (TP) / (TP + FN)
specificty <- (TN) / (TN + FP)
(202+117) / (202+36+36+117) # accuracy
## [1] 0.8158568
117 / (117 + 36 ) # true positive rate / senstivity - TP/P = TP / (TP+FN)
## [1] 0.7647059
202 / (202 + 36) # true negative rate / specificity - TN/N = TN / (TN+FP)
## [1] 0.8487395
# install.packages("caret")
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
?confusionMatrix
## Help on topic 'confusionMatrix' was found in the following packages:
##
## Package Library
## ModelMetrics /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
## caret /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
##
##
## Using the first match ...
caret::confusionMatrix(data = as.factor(test_subset$predicted_Survived),
reference = as.factor(test_subset$Survived),
positive = "1" # survived
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 202 36
## 1 36 117
##
## Accuracy : 0.8159
## 95% CI : (0.7738, 0.853)
## No Information Rate : 0.6087
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6134
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.7647
## Specificity : 0.8487
## Pos Pred Value : 0.7647
## Neg Pred Value : 0.8487
## Prevalence : 0.3913
## Detection Rate : 0.2992
## Detection Prevalence : 0.3913
## Balanced Accuracy : 0.8067
##
## 'Positive' Class : 1
##