setwd("C:/Users/12267/Desktop/UWindsor/Winter 2021/MSCI 3230 Data Science Tools & Methods/RSTUDIO Work")
df <- read.csv("data/cruise.csv")
library(ggplot2)
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(reshape)
library (tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.4
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.5 v dplyr 1.0.3
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::expand() masks reshape::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::rename() masks reshape::rename()
library (Hmisc)
## Warning: package 'Hmisc' was built under R version 4.0.4
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.0.4
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(leaps)
## Warning: package 'leaps' was built under R version 4.0.4
library(caret)
## Warning: package 'caret' was built under R version 4.0.4
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
## The following object is masked from 'package:purrr':
##
## lift
library(gains)
library(pROC)
## Warning: package 'pROC' was built under R version 4.0.4
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
#### Read the data
t(t(names(df))) ##display column names in a user friendly manner
## [,1]
## [1,] "sex"
## [2,] "retired"
## [3,] "cruises"
## [4,] "income"
## [5,] "expense"
## [6,] "expensecat"
#change to lowercase if so desired
names(df) <- tolower(names(df))
ggplot(df, aes(x = expense, y = income)) + geom_point()
ggplot(df, aes(x = as.factor (sex), y = expense, fill = factor (sex))) +
geom_boxplot(outlier.color = "red", outlier.shape = "o")+
labs(title = "Expense vs. Sex", x = "Sex", y = "Expense (in 000s)")
aggregate(expense ~ retired, data = df, mean)
## retired expense
## 1 0 1110.333
## 2 1 1643.067
reg3 <- lm(income ~ .-sex -retired -expensecat, data = df)
options(scipen = 999)
summary(reg3)
##
## Call:
## lm(formula = income ~ . - sex - retired - expensecat, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11555 -4894 -1063 5051 13813
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64357.59 3013.47 21.357 < 0.0000000000000002 ***
## cruises -3621.71 2055.31 -1.762 0.0894 .
## expense 14.32 2.84 5.043 0.0000271 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6878 on 27 degrees of freedom
## Multiple R-squared: 0.5903, Adjusted R-squared: 0.56
## F-statistic: 19.45 on 2 and 27 DF, p-value: 0.000005856
reg4 <- lm(income ~ .-sex -expensecat, data = df)
options(scipen = 999)
summary(reg4)
##
## Call:
## lm(formula = income ~ . - sex - expensecat, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11569 -4905 -1061 5058 13798
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64353.704 3096.582 20.782 < 0.0000000000000002 ***
## retired 26.864 2751.502 0.010 0.9923
## cruises -3622.004 2094.671 -1.729 0.0956 .
## expense 14.314 2.969 4.822 0.0000538 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7009 on 26 degrees of freedom
## Multiple R-squared: 0.5904, Adjusted R-squared: 0.5431
## F-statistic: 12.49 on 3 and 26 DF, p-value: 0.00003013
.
reg1 <- lm(income ~ .-expensecat, data = df)
# use options() to ensure numbers are not displayed in scientific notation.
options(scipen = 999)
summary(reg1)
##
## Call:
## lm(formula = income ~ . - expensecat, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11725 -4844 -1044 5001 13634
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64480.57 3303.68 19.518 < 0.0000000000000002 ***
## sex -345.58 2653.08 -0.130 0.897
## retired 24.44 2805.10 0.009 0.993
## cruises -3574.37 2166.51 -1.650 0.111
## expense 14.27 3.05 4.677 0.000086 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7145 on 25 degrees of freedom
## Multiple R-squared: 0.5906, Adjusted R-squared: 0.5251
## F-statistic: 9.017 on 4 and 25 DF, p-value: 0.0001188
set.seed(1) # set seed for reproducing the partition
train.rows <- sample(1:nrow(df), 18)
train.df <- df[train.rows, ]
train.df <- as.data.frame(train.df)#add on
valid.df <- df[-train.rows, ]
valid.df <- as.data.frame(valid.df)
reg2 <- lm(income ~ .-expensecat, data = train.df) #added-id
summary(reg2)
##
## Call:
## lm(formula = income ~ . - expensecat, data = train.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6374 -4638 -1982 4060 11911
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65415.036 3632.968 18.006 0.000000000142 ***
## sex -6733.443 3151.761 -2.136 0.0522 .
## retired 5845.539 3605.890 1.621 0.1290
## cruises -2080.574 2473.318 -0.841 0.4154
## expense 10.493 3.716 2.824 0.0144 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6345 on 13 degrees of freedom
## Multiple R-squared: 0.7015, Adjusted R-squared: 0.6097
## F-statistic: 7.639 on 4 and 13 DF, p-value: 0.002147
library(forecast)
# use predict() to make predictions on a new set.
lm.pred <- predict(reg2, valid.df)
options(scipen=999, digits = 0)
some.residuals <- valid.df$income[1:5] - lm.pred[1:5]
data.frame("Actual" = valid.df$income[1:5],"Predicted" = lm.pred[1:5],
"Residual" = some.residuals)
## Actual Predicted Residual
## 3 89162 92163 -3001
## 6 80410 72690 7720
## 8 85680 78952 6728
## 12 87210 75672 11538
## 13 82818 79073 3745
options(scipen=999, digits = 3)
t(accuracy(lm.pred, valid.df$income))
## Test set
## ME 2732.6
## RMSE 10492.3
## MAE 9168.9
## MPE 2.3
## MAPE 12.2
all.residuals <- valid.df$income - lm.pred
library(leaps)
search <- regsubsets(income ~ .-expensecat, data = train.df, nbest = 1, nvmax = dim(train.df)[2],
method = "exhaustive")
sum <- summary(search)
# show models
t(sum$which)
## 1 2 3 4
## (Intercept) TRUE TRUE TRUE TRUE
## sex FALSE TRUE TRUE TRUE
## retired FALSE FALSE TRUE TRUE
## cruises FALSE FALSE FALSE TRUE
## expense TRUE TRUE TRUE TRUE
sum$rsq
## [1] 0.523 0.611 0.685 0.702
sum$adjr2
## [1] 0.493 0.559 0.618 0.610
x <- which(sum$adjr2== max(sum$adjr2))
t(t(sum$which[x,]))
## [,1]
## (Intercept) TRUE
## sex TRUE
## retired TRUE
## cruises FALSE
## expense TRUE
##Change the colname to 'actual' from 'expense'
names(df)[names(df) == 'expense'] <- 'actual'
reg5 <- glm(expensecat~ .-actual, data = df, family = "binomial")
options(scipen=999)
summary(reg5)
##
## Call:
## glm(formula = expensecat ~ . - actual, family = "binomial", data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3166 -0.2265 -0.0905 0.3347 1.6201
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -19.3112685 7.6297618 -2.53 0.011 *
## sex -0.4548740 1.2516182 -0.36 0.716
## retired 1.5759226 1.3153745 1.20 0.231
## cruises 1.1363375 0.6938439 1.64 0.101
## income 0.0002074 0.0000901 2.30 0.021 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 41.455 on 29 degrees of freedom
## Residual deviance: 17.053 on 25 degrees of freedom
## AIC: 27.05
##
## Number of Fisher Scoring iterations: 6
psuedoR2 <- 1-reg5$deviance/reg5$null.deviance
psuedoR2
## [1] 0.589
#create a data frame
Propensity <- c(2775,2657, 2587, 2587, 2263, 1930, 1920, 1885, 1711, 1700, 1675, 1657, 1500, 1467,
1347, 1300, 1184, 1157, 1133, 1100, 1090, 924, 703, 582, 574, 388, 386, 375, 373, 371)
Actual <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
df <- cbind(Propensity, Actual)
df <- as.data.frame(df)
options(digits = 7)
confusionMatrix(factor(ifelse(df$Propensity>0.5, 1, 0)),
factor(df$Actual), positive = "1")
## Warning in confusionMatrix.default(factor(ifelse(df$Propensity > 0.5, 1, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 0 0
## 1 16 14
##
## Accuracy : 0.4667
## 95% CI : (0.2834, 0.6567)
## No Information Rate : 0.5333
## P-Value [Acc > NIR] : 0.8199299
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.0001768
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.4667
## Neg Pred Value : NaN
## Prevalence : 0.4667
## Detection Rate : 0.4667
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 1
##