library(readr)
# Load the cruise survey data; readr reports the column types it guessed.
final <- read_csv(file = "C:/Users/cbado/Downloads/cruise.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## sex = col_double(),
## retired = col_double(),
## cruises = col_double(),
## income = col_double(),
## expense = col_double(),
## expensecat = col_logical()
## )
# Open the data viewer only in an interactive session: View() needs a GUI
# and can fail when the script is run in batch mode.
if (interactive()) {
  View(final)
}
library(ggplot2)
library(cowplot)
## Warning: package 'cowplot' was built under R version 4.0.4
## A.1.
# Scatter plot of per-cruise spend against income, both axes in dollars.
ggplot(final, aes(x = income, y = expense)) +
  geom_point() +
  labs(
    title = "Cruise Expenditure by Income",
    x = "Income",
    y = "Average Amount Spent Per Cruise"
  ) +
  scale_x_continuous(labels = scales::dollar_format()) +
  scale_y_continuous(labels = scales::dollar_format())

# Box plot of per-cruise spend split by sex.
# NOTE(review): the tick labels are applied positionally, so this assumes the
# first level of factor(sex) codes Female -- confirm against the codebook.
ggplot(final, aes(x = factor(sex), y = expense, fill = factor(sex))) +
  geom_boxplot() +
  labs(
    title = "Cruise Expenditure by Gender",
    x = "Sex",
    y = "Average Amount Spent Per Cruise",
    fill = "Sex"
  ) +
  scale_x_discrete(labels = c("Female", "Male")) +
  scale_y_continuous(labels = scales::dollar_format())

# Mean per-cruise spend for each value of `retired` (presumably 1 = retired,
# 0 = not retired -- confirm against the codebook).
# `retired` is parsed as a numeric column, so compare against numbers instead
# of strings to avoid relying on implicit numeric-to-character coercion.
mean(final$expense[final$retired == 1])
## [1] 1643.067
mean(final$expense[final$retired == 0])
## [1] 1110.333
## A.2a
# Full linear model: expense on all four candidate predictors.
lm1 <- lm(
  formula = expense ~ sex + retired + cruises + income,
  data = final
)
summary(lm1)
##
## Call:
## lm(formula = expense ~ sex + retired + cruises + income, data = final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -608.03 -187.90 -70.66 255.77 694.54
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.038e+03 4.903e+02 -4.156 0.000331 ***
## sex -4.603e+01 1.268e+02 -0.363 0.719550
## retired 1.075e+02 1.326e+02 0.810 0.425406
## cruises 4.100e+02 7.219e+01 5.680 6.49e-06 ***
## income 3.272e-02 6.995e-03 4.677 8.60e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 342.2 on 25 degrees of freedom
## Multiple R-squared: 0.8148, Adjusted R-squared: 0.7851
## F-statistic: 27.49 on 4 and 25 DF, p-value: 7.852e-09
# Reduced linear model: expense on number of cruises and income only.
lm2 <- lm(
  formula = expense ~ cruises + income,
  data = final
)
summary(lm2)
##
## Call:
## lm(formula = expense ~ cruises + income, data = final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -544.07 -231.95 -42.23 261.92 730.12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.111e+03 4.682e+02 -4.509 0.000114 ***
## cruises 4.182e+02 6.828e+01 6.125 1.52e-06 ***
## income 3.387e-02 6.716e-03 5.043 2.71e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 334.5 on 27 degrees of freedom
## Multiple R-squared: 0.8088, Adjusted R-squared: 0.7947
## F-statistic: 57.12 on 2 and 27 DF, p-value: 1.989e-10
# Single-predictor linear model: expense on number of cruises.
lm3 <- lm(formula = expense ~ cruises, data = final)
summary(lm3)
##
## Call:
## lm(formula = expense ~ cruises, data = final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -931.0 -288.8 -128.6 217.6 1003.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 133.19 198.96 0.669 0.509
## cruises 573.93 83.34 6.887 1.74e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 457.7 on 28 degrees of freedom
## Multiple R-squared: 0.6288, Adjusted R-squared: 0.6155
## F-statistic: 47.43 on 1 and 28 DF, p-value: 1.744e-07
The best model is the regression model that uses number of cruises and income to predict expense amount. This model has the highest adjusted R^2 value (0.7947, versus 0.7851 for the full model and 0.6155 for the cruises-only model), and all of its predictors are statistically significant.
## A.2b
# Toy check of the mean split: values at or below the mean get label "0",
# values above it get label "1".
sett <- seq_len(5)
cut(
  sett,
  breaks = c(0, mean(sett), Inf),
  labels = c("0", "1")
)
## [1] 0 0 0 1 1
## Levels: 0 1
# Rebuild expensecat as a two-level factor: "0" when expense is at or below
# the sample mean, "1" when it is above the mean.
# Drop the placeholder column that came in from the CSV (parsed as logical);
# assigning NULL is a no-op if the column is already gone.
final$expensecat <- NULL
# The lower break is -Inf rather than 0: cut() intervals are open on the left
# by default, so with a break at 0 an expense of exactly 0 would silently
# become NA instead of falling in the "0" category.
expensecat <- cut(
  final$expense,
  breaks = c(-Inf, mean(final$expense), Inf),
  labels = c("0", "1")
)
final <- cbind(final, expensecat)
# View() needs a GUI, so only open the viewer in an interactive session.
if (interactive()) {
  View(final)
}

# Logistic regression of the above-mean-spend indicator on income.
lin <- glm(expensecat ~ income, data = final, family = "binomial")
lin
##
## Call: glm(formula = expensecat ~ income, family = "binomial", data = final)
##
## Coefficients:
## (Intercept) income
## -1.758e+01 2.232e-04
##
## Degrees of Freedom: 29 Total (i.e. Null); 28 Residual
## Null Deviance: 41.46
## Residual Deviance: 23.29 AIC: 27.29