require(faraway)
## Loading required package: faraway
data(teengamb)
summary(teengamb)
## sex status income verbal
## Min. :0.0000 Min. :18.00 Min. : 0.600 Min. : 1.00
## 1st Qu.:0.0000 1st Qu.:28.00 1st Qu.: 2.000 1st Qu.: 6.00
## Median :0.0000 Median :43.00 Median : 3.250 Median : 7.00
## Mean :0.4043 Mean :45.23 Mean : 4.642 Mean : 6.66
## 3rd Qu.:1.0000 3rd Qu.:61.50 3rd Qu.: 6.210 3rd Qu.: 8.00
## Max. :1.0000 Max. :75.00 Max. :15.000 Max. :10.00
## gamble
## Min. : 0.0
## 1st Qu.: 1.1
## Median : 6.0
## Mean : 19.3
## 3rd Qu.: 19.4
## Max. :156.0
*Based on this data, we see the various quartiles for each predictor variable along with the mean and median.
verbal_percentage <- 100*(teengamb$verbal)/12
x <- sort(verbal_percentage)
hist(x, col= "purple", xlab = "Verbal Percentage Correct", main="Histogram of Verbal % Correct")
*Based on this data, we see that the highest verbal score is a 10/12 and the lowest is 1/12. The mean score is 6.66.
hist(teengamb$sex, col = "red", xlab= "MALE and FEMALE", main="Histogram of frequency of Each Gender")
*Based on this data, we see that there are more teen men gambling than women!
plot(teengamb$income, teengamb$gamble, main = "Income vs. Gambling Scatterplot", pch=16, col="blue", xlab="Income in pounds/week", ylab = "Expenditure in gambling in pounds/year")
*Based on this plot, we can hypothesize that as an individual’s income increases, they are more likely to expend more on gambling.
sex_gamble <- lm(gamble ~ sex, data=teengamb)
summary(sex_gamble)
##
## Call:
## lm(formula = gamble ~ sex, data = teengamb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.775 -18.325 -3.766 6.334 126.225
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.775 5.498 5.415 2.28e-06 ***
## sex -25.909 8.648 -2.996 0.00444 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29.09 on 45 degrees of freedom
## Multiple R-squared: 0.1663, Adjusted R-squared: 0.1478
## F-statistic: 8.977 on 1 and 45 DF, p-value: 0.004437
status_gamble <-lm(gamble ~ status, data=teengamb)
summary(status_gamble)
##
## Call:
## lm(formula = gamble ~ status, data = teengamb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.708 -17.903 -13.929 2.195 135.020
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.46486 13.14189 1.786 0.0809 .
## status -0.09205 0.27180 -0.339 0.7364
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.82 on 45 degrees of freedom
## Multiple R-squared: 0.002542, Adjusted R-squared: -0.01962
## F-statistic: 0.1147 on 1 and 45 DF, p-value: 0.7364
income_gamble<- lm(gamble ~ income, data=teengamb)
summary(income_gamble)
##
## Call:
## lm(formula = gamble ~ income, data = teengamb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.020 -11.874 -3.757 11.934 107.120
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.325 6.030 -1.049 0.3
## income 5.520 1.036 5.330 3.05e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.95 on 45 degrees of freedom
## Multiple R-squared: 0.387, Adjusted R-squared: 0.3734
## F-statistic: 28.41 on 1 and 45 DF, p-value: 3.045e-06
verbal_gamble<- lm(gamble ~ verbal, data=teengamb)
summary(verbal_gamble)
##
## Call:
## lm(formula = gamble ~ verbal, data = teengamb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.036 -18.047 -13.294 4.271 126.764
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.178 17.053 2.591 0.0129 *
## verbal -3.736 2.469 -1.513 0.1372
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.08 on 45 degrees of freedom
## Multiple R-squared: 0.04842, Adjusted R-squared: 0.02728
## F-statistic: 2.29 on 1 and 45 DF, p-value: 0.1372
plot(fitted(status_gamble), residuals(status_gamble),
xlab="Predicted Scores", ylab="Residuals", main="Residuals Vs. Predicted (Status)", pch=16, col="green")
fit <- lm(gamble ~ sex + status + income + verbal, data=teengamb)
summary(fit)
##
## Call:
## lm(formula = gamble ~ sex + status + income + verbal, data = teengamb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.082 -11.320 -1.451 9.452 94.252
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.55565 17.19680 1.312 0.1968
## sex -22.11833 8.21111 -2.694 0.0101 *
## status 0.05223 0.28111 0.186 0.8535
## income 4.96198 1.02539 4.839 1.79e-05 ***
## verbal -2.95949 2.17215 -1.362 0.1803
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.69 on 42 degrees of freedom
## Multiple R-squared: 0.5267, Adjusted R-squared: 0.4816
## F-statistic: 11.69 on 4 and 42 DF, p-value: 1.815e-06
Part D i.) Based on the Multiple R-squared value of 0.5267, we can say that 52.7% of variation in gambling is explained by these sex, status, income, and verbal scores.
Part D ii.) The mean of the residuals is -3.065293e-17 and the median of the residuals is -1.451392.
mean(resid(fit))
## [1] -3.065293e-17
median(resid(fit))
## [1] -1.451392
cor(fit$residuals,fit$fitted.values)
## [1] -1.070659e-16
cor(fit$residuals,teengamb$income)
## [1] -7.242382e-17
# gamble = 22.55565 (minus) 22.11833 x sex + 0.05223 x status + 4.96198 x income (minus) 2.95949 x verbal
set.seed(1234)
x <- runif(100,0,10)
y <- 3 + x + x^2 + rnorm(100,0,1)
#First model (not the true model)
m1 <- lm(y~x)
#Second Model (the true model)
m2 <- lm(y~x+I(x^2)) #(Note the need for the I() function in fitting the polynomial)
par(mfrow=c(1,1),cex=1.25)
plot(m1$fitted.values, m1$residuals,xlab="Predicted Values from M1", ylab="Residuals", pch=16, col="orange", main="Predicted vs. Residuals (M1)")
abline(0,0)
par(mfrow=c(1,1),cex=1.25)
plot(m2$fitted.values, m2$residuals, xlab="Predicted Values from M2", ylab="Residuals", pch=16, col="pink", main="Predicted vs. Residuals (M2)")
abline(0,0)