Final Exam

setwd("C:/Users/12267/Desktop/UWindsor/Winter 2021/MSCI 3230 Data Science Tools & Methods/RSTUDIO Work")

df <- read.csv("data/cruise.csv")

library(ggplot2)
library(gplots)

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

library(reshape)
library (tidyverse)

## Warning: package 'tidyverse' was built under R version 4.0.4

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v tibble  3.0.5     v dplyr   1.0.3
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## v purrr   0.3.4

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::expand() masks reshape::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::rename() masks reshape::rename()

library (Hmisc)

## Warning: package 'Hmisc' was built under R version 4.0.4

## Loading required package: lattice

## Warning: package 'lattice' was built under R version 4.0.4

## Loading required package: survival

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(forecast)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(leaps)

## Warning: package 'leaps' was built under R version 4.0.4

library(caret)

## Warning: package 'caret' was built under R version 4.0.4

## 
## Attaching package: 'caret'

## The following object is masked from 'package:survival':
## 
##     cluster

## The following object is masked from 'package:purrr':
## 
##     lift

library(gains)
library(pROC)

## Warning: package 'pROC' was built under R version 4.0.4

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

List all library

#### Read the data
t(t(names(df))) ##display column names in a user friendly manner

##      [,1]        
## [1,] "sex"       
## [2,] "retired"   
## [3,] "cruises"   
## [4,] "income"    
## [5,] "expense"   
## [6,] "expensecat"

#change to lowercase if so desired
names(df) <- tolower(names(df))

Read the data and display column names within dataset

ggplot(df, aes(x = expense, y = income)) + geom_point()

By running this function, as enpense increases, income increases

ggplot(df, aes(x = as.factor (sex), y = expense, fill = factor (sex))) + 
  geom_boxplot(outlier.color = "red", outlier.shape = "o")+
  labs(title = "Expense vs. Sex", x = "Sex", y = "Expense (in 000s)")

aggregate(expense ~ retired, data = df, mean)

##   retired  expense
## 1       0 1110.333
## 2       1 1643.067

reg3 <- lm(income ~ .-sex -retired -expensecat, data = df) 
options(scipen = 999)
summary(reg3)

## 
## Call:
## lm(formula = income ~ . - sex - retired - expensecat, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -11555  -4894  -1063   5051  13813 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) 64357.59    3013.47  21.357 < 0.0000000000000002 ***
## cruises     -3621.71    2055.31  -1.762               0.0894 .  
## expense        14.32       2.84   5.043            0.0000271 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6878 on 27 degrees of freedom
## Multiple R-squared:  0.5903, Adjusted R-squared:   0.56 
## F-statistic: 19.45 on 2 and 27 DF,  p-value: 0.000005856

reg4 <- lm(income ~ .-sex -expensecat, data = df) 
options(scipen = 999)
summary(reg4)

## 
## Call:
## lm(formula = income ~ . - sex - expensecat, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -11569  -4905  -1061   5058  13798 
## 
## Coefficients:
##              Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) 64353.704   3096.582  20.782 < 0.0000000000000002 ***
## retired        26.864   2751.502   0.010               0.9923    
## cruises     -3622.004   2094.671  -1.729               0.0956 .  
## expense        14.314      2.969   4.822            0.0000538 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7009 on 26 degrees of freedom
## Multiple R-squared:  0.5904, Adjusted R-squared:  0.5431 
## F-statistic: 12.49 on 3 and 26 DF,  p-value: 0.00003013

reg1 <- lm(income ~ .-expensecat, data = df) 
#  use options() to ensure numbers are not displayed in scientific notation.
options(scipen = 999)
summary(reg1)

## 
## Call:
## lm(formula = income ~ . - expensecat, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -11725  -4844  -1044   5001  13634 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) 64480.57    3303.68  19.518 < 0.0000000000000002 ***
## sex          -345.58    2653.08  -0.130                0.897    
## retired        24.44    2805.10   0.009                0.993    
## cruises     -3574.37    2166.51  -1.650                0.111    
## expense        14.27       3.05   4.677             0.000086 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7145 on 25 degrees of freedom
## Multiple R-squared:  0.5906, Adjusted R-squared:  0.5251 
## F-statistic: 9.017 on 4 and 25 DF,  p-value: 0.0001188

set.seed(1)  # set seed for reproducing the partition

train.rows <- sample(1:nrow(df), 18)

train.df <- df[train.rows, ]
train.df <- as.data.frame(train.df)#add on
valid.df <- df[-train.rows, ]
valid.df <- as.data.frame(valid.df)
reg2 <- lm(income ~ .-expensecat, data = train.df) #added-id
summary(reg2)

## 
## Call:
## lm(formula = income ~ . - expensecat, data = train.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -6374  -4638  -1982   4060  11911 
## 
## Coefficients:
##              Estimate Std. Error t value       Pr(>|t|)    
## (Intercept) 65415.036   3632.968  18.006 0.000000000142 ***
## sex         -6733.443   3151.761  -2.136         0.0522 .  
## retired      5845.539   3605.890   1.621         0.1290    
## cruises     -2080.574   2473.318  -0.841         0.4154    
## expense        10.493      3.716   2.824         0.0144 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6345 on 13 degrees of freedom
## Multiple R-squared:  0.7015, Adjusted R-squared:  0.6097 
## F-statistic: 7.639 on 4 and 13 DF,  p-value: 0.002147

library(forecast)
# use predict() to make predictions on a new set. 
lm.pred <- predict(reg2, valid.df)
options(scipen=999, digits = 0)
some.residuals <- valid.df$income[1:5] - lm.pred[1:5]
data.frame("Actual" = valid.df$income[1:5],"Predicted" = lm.pred[1:5], 
           "Residual" = some.residuals)

##    Actual Predicted Residual
## 3   89162     92163    -3001
## 6   80410     72690     7720
## 8   85680     78952     6728
## 12  87210     75672    11538
## 13  82818     79073     3745

options(scipen=999, digits = 3)

t(accuracy(lm.pred, valid.df$income))

##      Test set
## ME     2732.6
## RMSE  10492.3
## MAE    9168.9
## MPE       2.3
## MAPE     12.2

all.residuals <- valid.df$income - lm.pred

library(leaps)
search <- regsubsets(income ~ .-expensecat, data = train.df, nbest = 1, nvmax = dim(train.df)[2],
                     method = "exhaustive")
sum <- summary(search)
# show models
t(sum$which)

##                 1     2     3    4
## (Intercept)  TRUE  TRUE  TRUE TRUE
## sex         FALSE  TRUE  TRUE TRUE
## retired     FALSE FALSE  TRUE TRUE
## cruises     FALSE FALSE FALSE TRUE
## expense      TRUE  TRUE  TRUE TRUE

sum$rsq

## [1] 0.523 0.611 0.685 0.702

sum$adjr2

## [1] 0.493 0.559 0.618 0.610

x <- which(sum$adjr2== max(sum$adjr2))
t(t(sum$which[x,]))

##              [,1]
## (Intercept)  TRUE
## sex          TRUE
## retired      TRUE
## cruises     FALSE
## expense      TRUE

##Change the colname to 'actual' from 'expense'
names(df)[names(df) == 'expense'] <- 'actual'

reg5 <- glm(expensecat~ .-actual, data = df, family = "binomial") 
options(scipen=999)
summary(reg5)

## 
## Call:
## glm(formula = expensecat ~ . - actual, family = "binomial", data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3166  -0.2265  -0.0905   0.3347   1.6201  
## 
## Coefficients:
##                Estimate  Std. Error z value Pr(>|z|)  
## (Intercept) -19.3112685   7.6297618   -2.53    0.011 *
## sex          -0.4548740   1.2516182   -0.36    0.716  
## retired       1.5759226   1.3153745    1.20    0.231  
## cruises       1.1363375   0.6938439    1.64    0.101  
## income        0.0002074   0.0000901    2.30    0.021 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 41.455  on 29  degrees of freedom
## Residual deviance: 17.053  on 25  degrees of freedom
## AIC: 27.05
## 
## Number of Fisher Scoring iterations: 6

psuedoR2 <- 1-reg5$deviance/reg5$null.deviance
psuedoR2

## [1] 0.589

#create a data frame
Propensity <- c(2775,2657, 2587, 2587, 2263, 1930, 1920, 1885, 1711, 1700, 1675, 1657, 1500, 1467,
                1347, 1300, 1184, 1157, 1133, 1100, 1090,  924,  703,  582,  574,  388,  386,  375,  373,  371)
Actual <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
df <- cbind(Propensity, Actual)
df <- as.data.frame(df)

options(digits = 7)
confusionMatrix(factor(ifelse(df$Propensity>0.5, 1, 0)),
                factor(df$Actual), positive = "1")

## Warning in confusionMatrix.default(factor(ifelse(df$Propensity > 0.5, 1, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0  0  0
##          1 16 14
##                                           
##                Accuracy : 0.4667          
##                  95% CI : (0.2834, 0.6567)
##     No Information Rate : 0.5333          
##     P-Value [Acc > NIR] : 0.8199299       
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 0.0001768       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.4667          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.4667          
##          Detection Rate : 0.4667          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 1               
##

Final Exam

Hassan Al Hakim

19/04/2021

List all library

Read the data and display column names within dataset

By running this function, as enpense increases, income increases