#HW2
#APPLIED LINEAR STATISTICAL MODELS
#26:960:577, Fall 2016
# RUID: 174004689
#(1)
#generate a random vector X of length n = 50 consisting of random numbers uniformly distributed over [0, 1].
X = runif(50, min = 0, max = 1)
#(2)
#Compute the "sample mean"
mean(X)
## [1] 0.4500246
#(3)
#for k = 100 times (generating k different copies of X), so that you obtain k estimates of the sample mean.
k = 100
X = runif(50*k, min = 0, max = 1)
X <- matrix(X, nrow = k) #100 rows of X each containing 50 random numbers.
mu <- as.vector(k) #mean calculation
for(i in 1:k)
{
mu[i] = mean(X[i,]) #calculating k estimates of the sample mean
}
w1 = mu #Stacking them into a vector w
w1
## [1] 0.4833148 0.5379803 0.4241665 0.4839196 0.5353882 0.5234864 0.5088497
## [8] 0.4697069 0.4657661 0.5750400 0.5331238 0.4166225 0.4001437 0.4521163
## [15] 0.5964005 0.4665157 0.5735959 0.4568133 0.5610366 0.4774666 0.5394617
## [22] 0.4986431 0.4407367 0.4483632 0.4657443 0.4815746 0.5409750 0.5530416
## [29] 0.5921015 0.5474650 0.5765274 0.5079158 0.4748969 0.4983023 0.4656641
## [36] 0.5872027 0.5296216 0.5210533 0.5132189 0.4392648 0.5382709 0.5242223
## [43] 0.4809916 0.4982311 0.5739686 0.4692175 0.5084500 0.4994576 0.5777218
## [50] 0.5046906 0.4500914 0.4680174 0.5432364 0.5102596 0.4799138 0.4997459
## [57] 0.5084414 0.4752718 0.4444179 0.4664411 0.5566169 0.4806515 0.4962193
## [64] 0.4281657 0.5532876 0.5404744 0.4332333 0.4206919 0.4613509 0.5964503
## [71] 0.4542236 0.5176983 0.4856503 0.4451992 0.5754759 0.4672087 0.4860329
## [78] 0.4847805 0.5080683 0.5379051 0.5691409 0.4424466 0.5665093 0.5834919
## [85] 0.4744141 0.4648387 0.5210882 0.5184217 0.4942142 0.5294703 0.5016941
## [92] 0.5033375 0.4156738 0.5508334 0.5161423 0.5144654 0.4421934 0.4754283
## [99] 0.4968978 0.4386566
#Plotting the histogram of w when k=100:
hist(w1)

## From this histogram, we see that w, i.e. the collection of k estimates of mean is distributed between 0.35 to 0.65
#(4)
#mean when k=100
mean(w1)
## [1] 0.5013303
#variance when k=100
var(w1)
## [1] 0.002224352
#(5)
#When k = 5000
k = 5000
X = runif(50*k, min = 0, max = 1)
X <- matrix(X, nrow = k) #100 rows of X each containing 50 random numbers.
mu <- as.vector(k) #mean calculation
for(i in 1:k)
{
mu[i] = mean(X[i,]) #calculating k estimates of the sample mean
}
w2 = mu #Stacking them into a vector w
#Plotting the histogram of w when k=5000:
hist(w2)

#mean when k=5000
mean(w2)
## [1] 0.499957
#mean when k=5000
var(w2)
## [1] 0.001707491
#(5)
#When k = 100000
k = 100000
X = runif(50*k, min = 0, max = 1)
X <- matrix(X, nrow = k) #100 rows of X each containing 50 random numbers.
mu <- as.vector(k) #mean calculation
for(i in 1:k)
{
mu[i] = mean(X[i,]) #calculating k estimates of the sample mean
}
w3 = mu #Stacking them into a vector w
#Plotting the histogram of w when k=100000:
hist(w3)

#mean when k=100000
mean(w3)
## [1] 0.4998422
#mean when k=100000
var(w3)
## [1] 0.001673399
#(6)
#Plot the histogram of w-mean(w)/stdev(w) for k = 100; 5000; 100000:
#When k = 100
y1 = (w1 - mean(w1))/sd(w1)
hist(y1)

mean(y1)
## [1] -3.368139e-16
var(y1)
## [1] 1
y11 = dnorm(y1, mean = 0, sd = 1)
plot(y1, y11, type = "h", xlab = "function x when k = 100", ylab = "Normal Distribution" , main = "Density of the Gaussian distribution with mean = 0 and variance = 0")

#When k = 5000
y2 = (w2 - mean(w2))/sd(w2)
hist(y2)

mean(y2)
## [1] 2.504488e-16
var(y2)
## [1] 1
y21 = dnorm(y2, mean = 0, sd = 1)
plot(y2, y21, type = "h", xlab = "function x when k = 5000", ylab = "Normal Distribution" , main = "Density of the Gaussian distribution with mean = 0 and variance = 0")

#when k = 100000
y3 = (w3 - mean(w3))/sd(w3)
hist(y3)

mean(y3)
## [1] 3.484901e-16
var(y3)
## [1] 1
y31 = dnorm(y3, mean = 0, sd = 1)
plot(y3, y31, type = "h", xlab = "function x when k = 100000", ylab = "Normal Distribution" , main = "Density of the Gaussian distribution with mean = 0 and variance = 0")
## Thus, we observe that the histogram gets closer to the density of the Gaussian distribution with mean 0 and variance 1 as k gets larger.
#(7)
#The dataset teengamb concerns a study of teenage gambling in Britain. Fit a regression
#model with the expenditure on gambling as the response and the sex, status, income
#and verbal score as predictors. Present the output.
library("faraway")

data("teengamb")
teengamb
## sex status income verbal gamble
## 1 1 51 2.00 8 0.00
## 2 1 28 2.50 8 0.00
## 3 1 37 2.00 6 0.00
## 4 1 28 7.00 4 7.30
## 5 1 65 2.00 8 19.60
## 6 1 61 3.47 6 0.10
## 7 1 28 5.50 7 1.45
## 8 1 27 6.42 5 6.60
## 9 1 43 2.00 6 1.70
## 10 1 18 6.00 7 0.10
## 11 1 18 3.00 6 0.10
## 12 1 43 4.75 6 5.40
## 13 1 30 2.20 4 1.20
## 14 1 28 2.00 6 3.60
## 15 1 38 3.00 6 2.40
## 16 1 38 1.50 8 3.40
## 17 1 28 9.50 8 0.10
## 18 1 18 10.00 5 8.40
## 19 1 43 4.00 8 12.00
## 20 0 51 3.50 9 0.00
## 21 0 62 3.00 8 1.00
## 22 0 47 2.50 9 1.20
## 23 0 43 3.50 5 0.10
## 24 0 27 10.00 4 156.00
## 25 0 71 6.50 7 38.50
## 26 0 38 1.50 7 2.10
## 27 0 51 5.44 4 14.50
## 28 0 38 1.00 6 3.00
## 29 0 51 0.60 7 0.60
## 30 0 62 5.50 8 9.60
## 31 0 18 12.00 2 88.00
## 32 0 30 7.00 7 53.20
## 33 0 38 15.00 7 90.00
## 34 0 71 2.00 10 3.00
## 35 0 28 1.50 1 14.10
## 36 0 61 4.50 8 70.00
## 37 0 71 2.50 7 38.50
## 38 0 28 8.00 6 57.20
## 39 0 51 10.00 6 6.00
## 40 0 65 1.60 6 25.00
## 41 0 48 2.00 9 6.90
## 42 0 61 15.00 9 69.70
## 43 0 75 3.00 8 13.30
## 44 0 66 3.25 9 0.60
## 45 0 62 4.94 6 38.00
## 46 0 71 1.50 7 14.40
## 47 0 71 2.50 9 19.20
teengamb$sex <- factor(teengamb$sex)
levels(teengamb$sex) <- c("male","female")
summary(teengamb)
## sex status income verbal
## male :28 Min. :18.00 Min. : 0.600 Min. : 1.00
## female:19 1st Qu.:28.00 1st Qu.: 2.000 1st Qu.: 6.00
## Median :43.00 Median : 3.250 Median : 7.00
## Mean :45.23 Mean : 4.642 Mean : 6.66
## 3rd Qu.:61.50 3rd Qu.: 6.210 3rd Qu.: 8.00
## Max. :75.00 Max. :15.000 Max. :10.00
## gamble
## Min. : 0.0
## 1st Qu.: 1.1
## Median : 6.0
## Mean : 19.3
## 3rd Qu.: 19.4
## Max. :156.0
regmodel <- lm (gamble ~ sex + status + income + verbal, data=teengamb)
summary(regmodel)
##
## Call:
## lm(formula = gamble ~ sex + status + income + verbal, data = teengamb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.082 -11.320 -1.451 9.452 94.252
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.55565 17.19680 1.312 0.1968
## sexfemale -22.11833 8.21111 -2.694 0.0101 *
## status 0.05223 0.28111 0.186 0.8535
## income 4.96198 1.02539 4.839 1.79e-05 ***
## verbal -2.95949 2.17215 -1.362 0.1803
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.69 on 42 degrees of freedom
## Multiple R-squared: 0.5267, Adjusted R-squared: 0.4816
## F-statistic: 11.69 on 4 and 42 DF, p-value: 1.815e-06
#(a) What percentage of variation in the response is explained by these predictors?
## The coefficient of determination or percentage of variance is nothing but R^2:
## The percentage of variation in the response is given by the Multiple R-squared, which is 52.67%
#(b) Which observation has the largest (positive) residual? Give the case number.
which.max(regmodel$residuals)
## 24
## 24
## The 24th case has the largest residual.
#(c) Compute the mean and median of the residuals.
mean(regmodel$residuals)
## [1] -3.065293e-17
median(regmodel$residuals)
## [1] -1.451392
#(d) Compute the correlation of the residuals with the fitted values.
cor(regmodel$residuals, regmodel$fitted.values)
## [1] -1.070659e-16
#(e) Compute the correlation of the residuals with the income.
cor(regmodel$residuals, teengamb$income)
## [1] -7.242382e-17
#(f) For all other predictors held constant, what would be the difference in predicted
#expenditure on gambling for a male compared to a female?
## As we can see from the summary of our model:
summary(regmodel)
##
## Call:
## lm(formula = gamble ~ sex + status + income + verbal, data = teengamb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.082 -11.320 -1.451 9.452 94.252
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.55565 17.19680 1.312 0.1968
## sexfemale -22.11833 8.21111 -2.694 0.0101 *
## status 0.05223 0.28111 0.186 0.8535
## income 4.96198 1.02539 4.839 1.79e-05 ***
## verbal -2.95949 2.17215 -1.362 0.1803
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.69 on 42 degrees of freedom
## Multiple R-squared: 0.5267, Adjusted R-squared: 0.4816
## F-statistic: 11.69 on 4 and 42 DF, p-value: 1.815e-06
regmodel$coefficients
## (Intercept) sexfemale status income verbal
## 22.55565063 -22.11833009 0.05223384 4.96197922 -2.95949350
## Coefficient of sexfemale: -22.1183301
## Thus, we can say that on an average, teenage females spend $22.1183301 less on gamble than teenage males (when other predictors are being held constant).