require(faraway)
## Loading required package: faraway
data(teengamb)

Question 1 Part A

summary(teengamb)
##       sex             status          income           verbal     
##  Min.   :0.0000   Min.   :18.00   Min.   : 0.600   Min.   : 1.00  
##  1st Qu.:0.0000   1st Qu.:28.00   1st Qu.: 2.000   1st Qu.: 6.00  
##  Median :0.0000   Median :43.00   Median : 3.250   Median : 7.00  
##  Mean   :0.4043   Mean   :45.23   Mean   : 4.642   Mean   : 6.66  
##  3rd Qu.:1.0000   3rd Qu.:61.50   3rd Qu.: 6.210   3rd Qu.: 8.00  
##  Max.   :1.0000   Max.   :75.00   Max.   :15.000   Max.   :10.00  
##      gamble     
##  Min.   :  0.0  
##  1st Qu.:  1.1  
##  Median :  6.0  
##  Mean   : 19.3  
##  3rd Qu.: 19.4  
##  Max.   :156.0

*Based on this data, we see the various quartiles for each predictor variable along with the mean and median.

verbal_percentage <- 100*(teengamb$verbal)/12
x <- sort(verbal_percentage)
hist(x, col= "purple", xlab = "Verbal Percentage Correct", main="Histogram of Verbal % Correct")

*Based on this data, we see that the highest verbal score is a 10/12 and the lowest is 1/12. The mean score is 6.66.

hist(teengamb$sex, col = "red", xlab= "MALE and FEMALE", main="Histogram of frequency of Each Gender")

*Based on this data, we see that there are more teen men gambling than women!

plot(teengamb$income, teengamb$gamble, main = "Income vs. Gambling Scatterplot", pch=16, col="blue", xlab="Income in pounds/week", ylab = "Expenditure in gambling in pounds/year")

*Based on this plot, we can hypothesize that as an individual’s income increases, they are more likely to expend more on gambling.

Question 1 Part B:

sex_gamble <- lm(gamble ~ sex, data=teengamb)
summary(sex_gamble)
## 
## Call:
## lm(formula = gamble ~ sex, data = teengamb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.775 -18.325  -3.766   6.334 126.225 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   29.775      5.498   5.415 2.28e-06 ***
## sex          -25.909      8.648  -2.996  0.00444 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29.09 on 45 degrees of freedom
## Multiple R-squared:  0.1663, Adjusted R-squared:  0.1478 
## F-statistic: 8.977 on 1 and 45 DF,  p-value: 0.004437
status_gamble <-lm(gamble ~ status, data=teengamb)
summary(status_gamble)
## 
## Call:
## lm(formula = gamble ~ status, data = teengamb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.708 -17.903 -13.929   2.195 135.020 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 23.46486   13.14189   1.786   0.0809 .
## status      -0.09205    0.27180  -0.339   0.7364  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31.82 on 45 degrees of freedom
## Multiple R-squared:  0.002542,   Adjusted R-squared:  -0.01962 
## F-statistic: 0.1147 on 1 and 45 DF,  p-value: 0.7364
income_gamble<- lm(gamble ~ income, data=teengamb)
summary(income_gamble)
## 
## Call:
## lm(formula = gamble ~ income, data = teengamb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -46.020 -11.874  -3.757  11.934 107.120 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -6.325      6.030  -1.049      0.3    
## income         5.520      1.036   5.330 3.05e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.95 on 45 degrees of freedom
## Multiple R-squared:  0.387,  Adjusted R-squared:  0.3734 
## F-statistic: 28.41 on 1 and 45 DF,  p-value: 3.045e-06
verbal_gamble<- lm(gamble ~ verbal, data=teengamb)
summary(verbal_gamble)
## 
## Call:
## lm(formula = gamble ~ verbal, data = teengamb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.036 -18.047 -13.294   4.271 126.764 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   44.178     17.053   2.591   0.0129 *
## verbal        -3.736      2.469  -1.513   0.1372  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31.08 on 45 degrees of freedom
## Multiple R-squared:  0.04842,    Adjusted R-squared:  0.02728 
## F-statistic:  2.29 on 1 and 45 DF,  p-value: 0.1372

Question 1 Part C:

plot(fitted(status_gamble), residuals(status_gamble),
 xlab="Predicted Scores", ylab="Residuals", main="Residuals Vs. Predicted (Status)", pch=16, col="green")

Question 1 Part D:

fit <- lm(gamble ~ sex + status + income + verbal, data=teengamb)
summary(fit) 
## 
## Call:
## lm(formula = gamble ~ sex + status + income + verbal, data = teengamb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.082 -11.320  -1.451   9.452  94.252 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  22.55565   17.19680   1.312   0.1968    
## sex         -22.11833    8.21111  -2.694   0.0101 *  
## status        0.05223    0.28111   0.186   0.8535    
## income        4.96198    1.02539   4.839 1.79e-05 ***
## verbal       -2.95949    2.17215  -1.362   0.1803    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.69 on 42 degrees of freedom
## Multiple R-squared:  0.5267, Adjusted R-squared:  0.4816 
## F-statistic: 11.69 on 4 and 42 DF,  p-value: 1.815e-06
mean(resid(fit))
## [1] -3.065293e-17
median(resid(fit))
## [1] -1.451392
cor(fit$residuals,fit$fitted.values)
## [1] -1.070659e-16
cor(fit$residuals,teengamb$income)
## [1] -7.242382e-17
# gamble = 22.55565 (minus) 22.11833 x sex + 0.05223 x status + 4.96198 x income (minus) 2.95949 x verbal 

Question 2

set.seed(1234)
x <- runif(100,0,10)
y <- 3 + x + x^2 + rnorm(100,0,1)
#First model (not the true model)
m1 <- lm(y~x)
#Second Model (the true model)
m2 <- lm(y~x+I(x^2)) #(Note the need for the I() function in fitting the polynomial)

par(mfrow=c(1,1),cex=1.25)
plot(m1$fitted.values, m1$residuals,xlab="Predicted Values from M1", ylab="Residuals", pch=16, col="orange", main="Predicted vs. Residuals (M1)")
abline(0,0)

par(mfrow=c(1,1),cex=1.25)
plot(m2$fitted.values, m2$residuals, xlab="Predicted Values from M2", ylab="Residuals", pch=16, col="pink", main="Predicted vs. Residuals (M2)")
abline(0,0)