STAT 308

require(faraway)

## Loading required package: faraway

data(teengamb)

Question 1 Part A

Here are some numerical and graphical statistics regarding the Teen Gambling dataset:

summary(teengamb)

##       sex             status          income           verbal     
##  Min.   :0.0000   Min.   :18.00   Min.   : 0.600   Min.   : 1.00  
##  1st Qu.:0.0000   1st Qu.:28.00   1st Qu.: 2.000   1st Qu.: 6.00  
##  Median :0.0000   Median :43.00   Median : 3.250   Median : 7.00  
##  Mean   :0.4043   Mean   :45.23   Mean   : 4.642   Mean   : 6.66  
##  3rd Qu.:1.0000   3rd Qu.:61.50   3rd Qu.: 6.210   3rd Qu.: 8.00  
##  Max.   :1.0000   Max.   :75.00   Max.   :15.000   Max.   :10.00  
##      gamble     
##  Min.   :  0.0  
##  1st Qu.:  1.1  
##  Median :  6.0  
##  Mean   : 19.3  
##  3rd Qu.: 19.4  
##  Max.   :156.0

*Based on this data, we see the various quartiles for each predictor variable along with the mean and median.

verbal_percentage <- 100*(teengamb$verbal)/12
x <- sort(verbal_percentage)
hist(x, col= "purple", xlab = "Verbal Percentage Correct", main="Histogram of Verbal % Correct")

*Based on this data, we see that the highest verbal score is a 10/12 and the lowest is 1/12. The mean score is 6.66.

hist(teengamb$sex, col = "red", xlab= "MALE and FEMALE", main="Histogram of frequency of Each Gender")

*Based on this data, we see that there are more teen men gambling than women!

plot(teengamb$income, teengamb$gamble, main = "Income vs. Gambling Scatterplot", pch=16, col="blue", xlab="Income in pounds/week", ylab = "Expenditure in gambling in pounds/year")

*Based on this plot, we can hypothesize that as an individual’s income increases, they are more likely to expend more on gambling.

Question 1 Part B:

sex_gamble <- lm(gamble ~ sex, data=teengamb)
summary(sex_gamble)

## 
## Call:
## lm(formula = gamble ~ sex, data = teengamb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.775 -18.325  -3.766   6.334 126.225 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   29.775      5.498   5.415 2.28e-06 ***
## sex          -25.909      8.648  -2.996  0.00444 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29.09 on 45 degrees of freedom
## Multiple R-squared:  0.1663, Adjusted R-squared:  0.1478 
## F-statistic: 8.977 on 1 and 45 DF,  p-value: 0.004437

status_gamble <-lm(gamble ~ status, data=teengamb)
summary(status_gamble)

## 
## Call:
## lm(formula = gamble ~ status, data = teengamb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.708 -17.903 -13.929   2.195 135.020 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 23.46486   13.14189   1.786   0.0809 .
## status      -0.09205    0.27180  -0.339   0.7364  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31.82 on 45 degrees of freedom
## Multiple R-squared:  0.002542,   Adjusted R-squared:  -0.01962 
## F-statistic: 0.1147 on 1 and 45 DF,  p-value: 0.7364

income_gamble<- lm(gamble ~ income, data=teengamb)
summary(income_gamble)

## 
## Call:
## lm(formula = gamble ~ income, data = teengamb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -46.020 -11.874  -3.757  11.934 107.120 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -6.325      6.030  -1.049      0.3    
## income         5.520      1.036   5.330 3.05e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.95 on 45 degrees of freedom
## Multiple R-squared:  0.387,  Adjusted R-squared:  0.3734 
## F-statistic: 28.41 on 1 and 45 DF,  p-value: 3.045e-06

verbal_gamble<- lm(gamble ~ verbal, data=teengamb)
summary(verbal_gamble)

## 
## Call:
## lm(formula = gamble ~ verbal, data = teengamb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.036 -18.047 -13.294   4.271 126.764 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   44.178     17.053   2.591   0.0129 *
## verbal        -3.736      2.469  -1.513   0.1372  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31.08 on 45 degrees of freedom
## Multiple R-squared:  0.04842,    Adjusted R-squared:  0.02728 
## F-statistic:  2.29 on 1 and 45 DF,  p-value: 0.1372

Question 1 Part C:

plot(fitted(status_gamble), residuals(status_gamble),
 xlab="Predicted Scores", ylab="Residuals", main="Residuals Vs. Predicted (Status)", pch=16, col="green")

Question 1 Part D:

fit <- lm(gamble ~ sex + status + income + verbal, data=teengamb)
summary(fit)

## 
## Call:
## lm(formula = gamble ~ sex + status + income + verbal, data = teengamb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.082 -11.320  -1.451   9.452  94.252 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  22.55565   17.19680   1.312   0.1968    
## sex         -22.11833    8.21111  -2.694   0.0101 *  
## status        0.05223    0.28111   0.186   0.8535    
## income        4.96198    1.02539   4.839 1.79e-05 ***
## verbal       -2.95949    2.17215  -1.362   0.1803    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.69 on 42 degrees of freedom
## Multiple R-squared:  0.5267, Adjusted R-squared:  0.4816 
## F-statistic: 11.69 on 4 and 42 DF,  p-value: 1.815e-06

Part D i.) Based on the Multiple R-squared value of 0.5267, we can say that 52.7% of variation in gambling is explained by these sex, status, income, and verbal scores.
Part D ii.) The mean of the residuals is -3.065293e-17 and the median of the residuals is -1.451392.

mean(resid(fit))

## [1] -3.065293e-17

median(resid(fit))

## [1] -1.451392

Part D iii.) The correlation of the residuals with the fitted models is -1.070659e-16.

cor(fit$residuals,fit$fitted.values)

## [1] -1.070659e-16

Part D. iv.) The correlation of the residuals with the income is -7.242382e-17

cor(fit$residuals,teengamb$income)

## [1] -7.242382e-17

Part D. v.) Based on the summary from 1.d.i), the fitted model can be written as:

# gamble = 22.55565 (minus) 22.11833 x sex + 0.05223 x status + 4.96198 x income (minus) 2.95949 x verbal

So, if all the other predictors besides sex are constant, then the difference in predicted expenditure on gambling between men (where the sex=0) and women (where the sex=1) will be -22.11833, which is the regression coefficient. This implies that when the sex changes, the value of gambling decreases by 22.11833. So, when sex=1 and the teen is a women, she spends 22.55565 - 22.11833*1, which is $22.11833 a man (sex=0)

Question 2

set.seed(1234)
x <- runif(100,0,10)
y <- 3 + x + x^2 + rnorm(100,0,1)

#First model (not the true model)
m1 <- lm(y~x)
#Second Model (the true model)
m2 <- lm(y~x+I(x^2)) #(Note the need for the I() function in fitting the polynomial)

par(mfrow=c(1,1),cex=1.25)
plot(m1$fitted.values, m1$residuals,xlab="Predicted Values from M1", ylab="Residuals", pch=16, col="orange", main="Predicted vs. Residuals (M1)")
abline(0,0)

par(mfrow=c(1,1),cex=1.25)
plot(m2$fitted.values, m2$residuals, xlab="Predicted Values from M2", ylab="Residuals", pch=16, col="pink", main="Predicted vs. Residuals (M2)")
abline(0,0)

Based on the residual plots, we see the following for each model:
- For M1, the graph of the residuals vs. the predicted values has a quadratic shape. The points are curved upwards and the residuals are not random. I would argue that the pattern is non-linear, and therefore the predictions are not helpful in making any arguments between the response and predictor variables.
- For M2, the graph of the residuals vs. the predicted values seems completely random, however it does have a slight heteroscedastic pattern (i.e. the left half of the graph has more points than the right half and the points slightly taper off).