flight <- read_excel("C:/Users/justt/Desktop/School/622/Exams/flight_data.xls")
str(flight)
## tibble [400 × 2] (S3: tbl_df/tbl/data.frame)
## $ speed : num [1:400] 27.7 29.2 33.8 34.1 34.2 ...
## $ distance: num [1:400] 1324 1077 937 1198 956 ...
Y <- flight$distance
X <- flight$speed
set.seed(1234)
fbsp <- lm(Y~bs(X, df = 5), data = flight)
set.seed(1234)
fbsp_interval <- predict(fbsp, interval = "confidence", level = 0.70)
plot(X, Y, xlab = "Speed", main = "Regression Spline with df = 5", ylab = "Distance", cex = .5)
lines(X, fbsp$fitted.values, col = "blue", lwd = 1)
lines(X, fbsp_interval[,2], col = "red", lty = 2)
lines(X, fbsp_interval[,3], col = "red", lty = 2)
set.seed(1234)
fcsp = lm(Y~ns(X, df = 3), data = flight)
set.seed(1234)
anova(fbsp, fcsp)
## Analysis of Variance Table
##
## Model 1: Y ~ bs(X, df = 5)
## Model 2: Y ~ ns(X, df = 3)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 394 14851546
## 2 396 15155277 -2 -303730 4.0289 0.01853 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Model 1, b spline, has the smaller sum of squared errors at 14851546, opposed to Model 2, natural cubic spine, with a value of 15155277.
There is a statistically significant difference between the predictive performance of the two models as the P-value is 0.01853, which is less than 0.05.
The model that is best at predicting distance is the b spline model because the RSS is lower than that of the natural cubic model, and it also has an Adjusted R-squared of 0.9632, 96.32% which is a pretty large indicator that this is good at predicting the distance variable.
set.seed(1234)
health <- read.csv("C:/Users/justt/Desktop/School/622/Exams/health_dataset.csv")
str(health)
## 'data.frame': 2000 obs. of 20 variables:
## $ Gender : chr "female" "female" "female" "female" ...
## $ Age : int 49 45 45 45 66 54 50 16 56 56 ...
## $ Weight : num 86.7 75.7 75.7 75.7 68 74.7 84.1 73.2 57.5 57.5 ...
## $ Height : num 168 167 167 167 170 ...
## $ BMI : num 30.6 27.2 27.2 27.2 23.7 ...
## $ Pulse : int 86 62 62 62 60 76 74 76 64 64 ...
## $ BPSysAve : int 112 118 118 118 111 134 142 126 95 95 ...
## $ BPDiaAve : int 75 64 64 64 63 85 68 72 69 69 ...
## $ BPSys1 : int 118 106 106 106 124 136 138 132 94 94 ...
## $ BPDia1 : int 82 62 62 62 64 86 66 74 74 74 ...
## $ BPSys2 : int 108 118 118 118 108 132 142 126 94 94 ...
## $ BPDia2 : int 74 68 68 68 62 88 74 68 70 70 ...
## $ BPSys3 : int 116 118 118 118 114 136 142 126 96 96 ...
## $ BPDia3 : int 76 60 60 60 64 82 62 76 68 68 ...
## $ DirectChol : num 1.16 2.12 2.12 2.12 0.67 1.16 1.06 1.14 2.22 2.22 ...
## $ TotChol : num 6.7 5.82 5.82 5.82 4.99 6.41 5.22 3 5.79 5.79 ...
## $ UrineVol1 : int 77 106 106 106 113 215 64 345 26 26 ...
## $ UrineFlow1 : num 0.094 1.116 1.116 1.116 0.489 ...
## $ Diabetes : chr "No" "No" "No" "No" ...
## $ SleepTrouble: chr "Yes" "No" "No" "No" ...
sample_index <- sample(nrow(health), nrow(health)*0.50)
health_train <- health[sample_index, ]
health_test <- health[-sample_index, ]
# decision tree
set.seed(1234)
health_train$Gender <- as.factor(health_train$Gender)
health_train$SleepTrouble <- as.factor(health_train$SleepTrouble)
health_test$Diabetes <- as.factor(health_test$Diabetes)
health_test$Gender <- as.factor(health_test$Gender)
health_test$SleepTrouble <- as.factor(health_test$SleepTrouble)
dtree_train <- rpart(formula = health_train$Diabetes~., data = health_train, method = "class")
dtree_pred <- predict(dtree_train, type = "class")
table(health_test$Diabetes, dtree_pred, dnn = c("Observed", "Predicted"))
## Predicted
## Observed No Yes
## No 878 33
## Yes 86 3
# random forest with 100 trees
set.seed(1234)
#I could have reloaded the Health data and then converted these records and then re-ran the train and test data set, but I cohose to not to because of the other models. I would have been better off doing this in the beginning. And I would if I had more time.
health_train$Diabetes <- as.factor(health_train$Diabetes)
health_train$Gender <- as.factor(health_train$Gender)
health_train$SleepTrouble <- as.factor(health_train$SleepTrouble )
health_test$Diabetes <- as.factor(health_test$Diabetes)
health_test$Gender <- as.factor(health_test$Gender)
health_test$SleepTrouble <- as.factor(health_test$SleepTrouble )
rf_train <- randomForest(Diabetes ~., data = health_train, importance = TRUE, ntree = 100)
rf_test_pred <- predict(rf_train, health_test)
table(health_test$Diabetes, rf_test_pred, dnn = c("Observed", "Predicted"))
## Predicted
## Observed No Yes
## No 910 1
## Yes 66 23
# boosting model
set.seed(1234)
#I could have reloaded the Health data and then converted these records and then re-ran the train and test data set, but I cohose to not to because of the other models. I would have been better off doing this in the beginning. And I would if I had more time.
health_train$Diabetes <- as.factor(health_train$Diabetes)
health_train$Gender <- as.factor(health_train$Gender)
health_train$SleepTrouble <- as.factor(health_train$SleepTrouble )
health_test$Diabetes <- as.factor(health_test$Diabetes)
health_test$Gender <- as.factor(health_test$Gender)
health_test$SleepTrouble <- as.factor(health_test$SleepTrouble )
htboost <- boosting(Diabetes ~., data = health_train, boos = T)
htboost_pred <- predict(htboost, newdata = health_test)
htboost_pred$confusion
## Observed Class
## Predicted Class No Yes
## No 903 61
## Yes 8 28
Show all the confusion matrices for comparison
table(health_test$Diabetes, dtree_pred, dnn = c("Observed", "Predicted"))
## Predicted
## Observed No Yes
## No 878 33
## Yes 86 3
table(health_test$Diabetes, rf_test_pred, dnn = c("Observed", "Predicted"))
## Predicted
## Observed No Yes
## No 910 1
## Yes 66 23
htboost_pred$confusion
## Observed Class
## Predicted Class No Yes
## No 903 61
## Yes 8 28
health1 <- read.csv("C:/Users/justt/Desktop/School/622/Exams/health_dataset.csv")
str(health1)
## 'data.frame': 2000 obs. of 20 variables:
## $ Gender : chr "female" "female" "female" "female" ...
## $ Age : int 49 45 45 45 66 54 50 16 56 56 ...
## $ Weight : num 86.7 75.7 75.7 75.7 68 74.7 84.1 73.2 57.5 57.5 ...
## $ Height : num 168 167 167 167 170 ...
## $ BMI : num 30.6 27.2 27.2 27.2 23.7 ...
## $ Pulse : int 86 62 62 62 60 76 74 76 64 64 ...
## $ BPSysAve : int 112 118 118 118 111 134 142 126 95 95 ...
## $ BPDiaAve : int 75 64 64 64 63 85 68 72 69 69 ...
## $ BPSys1 : int 118 106 106 106 124 136 138 132 94 94 ...
## $ BPDia1 : int 82 62 62 62 64 86 66 74 74 74 ...
## $ BPSys2 : int 108 118 118 118 108 132 142 126 94 94 ...
## $ BPDia2 : int 74 68 68 68 62 88 74 68 70 70 ...
## $ BPSys3 : int 116 118 118 118 114 136 142 126 96 96 ...
## $ BPDia3 : int 76 60 60 60 64 82 62 76 68 68 ...
## $ DirectChol : num 1.16 2.12 2.12 2.12 0.67 1.16 1.06 1.14 2.22 2.22 ...
## $ TotChol : num 6.7 5.82 5.82 5.82 4.99 6.41 5.22 3 5.79 5.79 ...
## $ UrineVol1 : int 77 106 106 106 113 215 64 345 26 26 ...
## $ UrineFlow1 : num 0.094 1.116 1.116 1.116 0.489 ...
## $ Diabetes : chr "No" "No" "No" "No" ...
## $ SleepTrouble: chr "Yes" "No" "No" "No" ...
healthnum <- health[,-1:-2]
str(healthnum)
## 'data.frame': 2000 obs. of 18 variables:
## $ Weight : num 86.7 75.7 75.7 75.7 68 74.7 84.1 73.2 57.5 57.5 ...
## $ Height : num 168 167 167 167 170 ...
## $ BMI : num 30.6 27.2 27.2 27.2 23.7 ...
## $ Pulse : int 86 62 62 62 60 76 74 76 64 64 ...
## $ BPSysAve : int 112 118 118 118 111 134 142 126 95 95 ...
## $ BPDiaAve : int 75 64 64 64 63 85 68 72 69 69 ...
## $ BPSys1 : int 118 106 106 106 124 136 138 132 94 94 ...
## $ BPDia1 : int 82 62 62 62 64 86 66 74 74 74 ...
## $ BPSys2 : int 108 118 118 118 108 132 142 126 94 94 ...
## $ BPDia2 : int 74 68 68 68 62 88 74 68 70 70 ...
## $ BPSys3 : int 116 118 118 118 114 136 142 126 96 96 ...
## $ BPDia3 : int 76 60 60 60 64 82 62 76 68 68 ...
## $ DirectChol : num 1.16 2.12 2.12 2.12 0.67 1.16 1.06 1.14 2.22 2.22 ...
## $ TotChol : num 6.7 5.82 5.82 5.82 4.99 6.41 5.22 3 5.79 5.79 ...
## $ UrineVol1 : int 77 106 106 106 113 215 64 345 26 26 ...
## $ UrineFlow1 : num 0.094 1.116 1.116 1.116 0.489 ...
## $ Diabetes : chr "No" "No" "No" "No" ...
## $ SleepTrouble: chr "Yes" "No" "No" "No" ...
healthnum <- healthnum[,-4:-12]
str(healthnum)
## 'data.frame': 2000 obs. of 9 variables:
## $ Weight : num 86.7 75.7 75.7 75.7 68 74.7 84.1 73.2 57.5 57.5 ...
## $ Height : num 168 167 167 167 170 ...
## $ BMI : num 30.6 27.2 27.2 27.2 23.7 ...
## $ DirectChol : num 1.16 2.12 2.12 2.12 0.67 1.16 1.06 1.14 2.22 2.22 ...
## $ TotChol : num 6.7 5.82 5.82 5.82 4.99 6.41 5.22 3 5.79 5.79 ...
## $ UrineVol1 : int 77 106 106 106 113 215 64 345 26 26 ...
## $ UrineFlow1 : num 0.094 1.116 1.116 1.116 0.489 ...
## $ Diabetes : chr "No" "No" "No" "No" ...
## $ SleepTrouble: chr "Yes" "No" "No" "No" ...
healthnum <- healthnum[,-6]
str(healthnum)
## 'data.frame': 2000 obs. of 8 variables:
## $ Weight : num 86.7 75.7 75.7 75.7 68 74.7 84.1 73.2 57.5 57.5 ...
## $ Height : num 168 167 167 167 170 ...
## $ BMI : num 30.6 27.2 27.2 27.2 23.7 ...
## $ DirectChol : num 1.16 2.12 2.12 2.12 0.67 1.16 1.06 1.14 2.22 2.22 ...
## $ TotChol : num 6.7 5.82 5.82 5.82 4.99 6.41 5.22 3 5.79 5.79 ...
## $ UrineFlow1 : num 0.094 1.116 1.116 1.116 0.489 ...
## $ Diabetes : chr "No" "No" "No" "No" ...
## $ SleepTrouble: chr "Yes" "No" "No" "No" ...
healthnum <- healthnum[,-8]
str(healthnum)
## 'data.frame': 2000 obs. of 7 variables:
## $ Weight : num 86.7 75.7 75.7 75.7 68 74.7 84.1 73.2 57.5 57.5 ...
## $ Height : num 168 167 167 167 170 ...
## $ BMI : num 30.6 27.2 27.2 27.2 23.7 ...
## $ DirectChol: num 1.16 2.12 2.12 2.12 0.67 1.16 1.06 1.14 2.22 2.22 ...
## $ TotChol : num 6.7 5.82 5.82 5.82 4.99 6.41 5.22 3 5.79 5.79 ...
## $ UrineFlow1: num 0.094 1.116 1.116 1.116 0.489 ...
## $ Diabetes : chr "No" "No" "No" "No" ...
sample_index <- sample(nrow(healthnum), nrow(healthnum)*0.50)
healthnum_train <- healthnum[sample_index, ]
healthnum_test <- healthnum[-sample_index, ]
knn_healthnum <- knn(train = healthnum_train[, -7], test = healthnum_test[, -7], cl = healthnum_train[, 7], k = 5)
table(healthnum_test[,7], knn_healthnum, dnn = c("True", "Predicted"))
## Predicted
## True No Yes
## No 903 18
## Yes 72 7
sum(healthnum_test[, 7] != knn_healthnum) #number misclassified
## [1] 90
table(health_test$Diabetes, dtree_pred, dnn = c("Observed", "Predicted"))
## Predicted
## Observed No Yes
## No 878 33
## Yes 86 3
table(health_test$Diabetes, rf_test_pred, dnn = c("Observed", "Predicted"))
## Predicted
## Observed No Yes
## No 910 1
## Yes 66 23
htboost_pred$confusion
## Observed Class
## Predicted Class No Yes
## No 903 61
## Yes 8 28
table(healthnum_test[,7], knn_healthnum, dnn = c("True", "Predicted"))
## Predicted
## True No Yes
## No 903 18
## Yes 72 7
set.seed(1234)
payments <- read.csv("C:/Users/justt/Desktop/School/622/Exams/default_payments.csv")
payments$LIMIT_BAL <- as.numeric(payments$LIMIT_BAL)
payments$PAY_0 <- as.numeric(payments$PAY_0)
payments$BILL_AMT1 <- as.numeric(payments$BILL_AMT1)
payments$PAY_AMT1 <- as.numeric(payments$PAY_AMT1)
sample_index <- sample(nrow(payments), nrow(payments)*0.80)
payments_train <- payments[sample_index,]
payments_test <- payments[-sample_index,]
str(payments)
## 'data.frame': 12000 obs. of 8 variables:
## $ LIMIT_BAL: num 290000 20000 280000 280000 20000 50000 20000 30000 200000 110000 ...
## $ SEX : chr "F" "M" "M" "F" ...
## $ MARRIAGE : chr "Currently Married" "Currently Married" "Currently Married" "Currently Married" ...
## $ AGE : int 26 51 29 47 24 26 23 25 38 29 ...
## $ PAY_0 : num 0 -1 -2 0 0 -2 1 1 -2 0 ...
## $ BILL_AMT1: num 18125 780 10660 269124 17924 ...
## $ PAY_AMT1 : num 3000 0 5123 11268 1400 ...
## $ DEFAULT : int 0 0 0 0 0 0 1 0 0 0 ...
payments_gam <- gam(DEFAULT ~ s(LIMIT_BAL) + SEX + MARRIAGE + AGE + s(PAY_0) + s(BILL_AMT1) + s(PAY_AMT1) ,data =payments_train )
summary(payments_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## DEFAULT ~ s(LIMIT_BAL) + SEX + MARRIAGE + AGE + s(PAY_0) + s(BILL_AMT1) +
## s(PAY_AMT1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1929612 0.0156241 12.350 < 2e-16 ***
## SEXM 0.0279813 0.0078294 3.574 0.000353 ***
## MARRIAGESingle 0.0116421 0.0085311 1.365 0.172392
## AGE 0.0002544 0.0004656 0.546 0.584833
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(LIMIT_BAL) 8.104 8.671 18.009 <2e-16 ***
## s(PAY_0) 8.656 8.931 223.283 <2e-16 ***
## s(BILL_AMT1) 1.000 1.000 29.393 <2e-16 ***
## s(PAY_AMT1) 2.713 3.420 2.928 0.026 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.205 Deviance explained = 20.7%
## GCV = 0.13617 Scale est. = 0.13582 n = 9600
#Marriage and Age are not statistically significant as their P-Values are greater than .05
set.seed(1234)
payments_gam1 <- gam(DEFAULT ~ s(LIMIT_BAL) + SEX + s(PAY_0) + s(BILL_AMT1) + s(PAY_AMT1) ,data =payments_train )
summary(payments_gam1)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## DEFAULT ~ s(LIMIT_BAL) + SEX + s(PAY_0) + s(BILL_AMT1) + s(PAY_AMT1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.207319 0.004877 42.511 < 2e-16 ***
## SEXM 0.028122 0.007778 3.616 0.000301 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(LIMIT_BAL) 8.173 8.713 17.544 <2e-16 ***
## s(PAY_0) 8.664 8.934 224.247 <2e-16 ***
## s(BILL_AMT1) 1.000 1.000 30.982 <2e-16 ***
## s(PAY_AMT1) 2.984 3.744 2.915 0.0225 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.205 Deviance explained = 20.6%
## GCV = 0.13616 Scale est. = 0.13584 n = 9600
Marriage and Age are not statistically significant as their P-Values are greater that .05. After the new GAM model is ran removing these variable, all remaining variables are statistically significant.
set.seed(1234)
summary(payments_gam1)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## DEFAULT ~ s(LIMIT_BAL) + SEX + s(PAY_0) + s(BILL_AMT1) + s(PAY_AMT1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.207319 0.004877 42.511 < 2e-16 ***
## SEXM 0.028122 0.007778 3.616 0.000301 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(LIMIT_BAL) 8.173 8.713 17.544 <2e-16 ***
## s(PAY_0) 8.664 8.934 224.247 <2e-16 ***
## s(BILL_AMT1) 1.000 1.000 30.982 <2e-16 ***
## s(PAY_AMT1) 2.984 3.744 2.915 0.0225 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.205 Deviance explained = 20.6%
## GCV = 0.13616 Scale est. = 0.13584 n = 9600
The only numeric value that is related linearly to DEFAULT is BILL_AMT1.
set.seed(1234)
plot(payments_gam1, pages = 1)
set.seed(1234)
pcut_gam <- 1/2
prob_payments_gam1 <- predict(payments_gam1, payments_test, type = "response")
pred_payments_gam1 <- (prob_payments_gam1 >= pcut_gam)*1
table(payments_test$DEFAULT, pred_payments_gam1, dnn = c("Observed", "Predicted"))
## Predicted
## Observed 0 1
## 0 1776 90
## 1 370 164