#HW2
#APPLIED LINEAR STATISTICAL MODELS
#26:960:577, Fall 2016
# RUID: 174004689

#(1)
#generate a random vector X of length n = 50 consisting of random numbers uniformly distributed over [0, 1].

X = runif(50, min = 0, max = 1)

#(2)
#Compute the "sample mean"

mean(X)
## [1] 0.4500246
#(3)
#for k = 100 times (generating k different copies of X), so that you obtain k estimates of the sample mean.

k = 100
X = runif(50*k, min = 0, max = 1)
X <- matrix(X, nrow = k) #100 rows of X each containing 50 random numbers.

mu <- as.vector(k) #mean calculation

for(i in 1:k)
{
  mu[i] = mean(X[i,])  #calculating k estimates of the sample mean
}

w1 = mu #Stacking them into a vector w

w1
##   [1] 0.4833148 0.5379803 0.4241665 0.4839196 0.5353882 0.5234864 0.5088497
##   [8] 0.4697069 0.4657661 0.5750400 0.5331238 0.4166225 0.4001437 0.4521163
##  [15] 0.5964005 0.4665157 0.5735959 0.4568133 0.5610366 0.4774666 0.5394617
##  [22] 0.4986431 0.4407367 0.4483632 0.4657443 0.4815746 0.5409750 0.5530416
##  [29] 0.5921015 0.5474650 0.5765274 0.5079158 0.4748969 0.4983023 0.4656641
##  [36] 0.5872027 0.5296216 0.5210533 0.5132189 0.4392648 0.5382709 0.5242223
##  [43] 0.4809916 0.4982311 0.5739686 0.4692175 0.5084500 0.4994576 0.5777218
##  [50] 0.5046906 0.4500914 0.4680174 0.5432364 0.5102596 0.4799138 0.4997459
##  [57] 0.5084414 0.4752718 0.4444179 0.4664411 0.5566169 0.4806515 0.4962193
##  [64] 0.4281657 0.5532876 0.5404744 0.4332333 0.4206919 0.4613509 0.5964503
##  [71] 0.4542236 0.5176983 0.4856503 0.4451992 0.5754759 0.4672087 0.4860329
##  [78] 0.4847805 0.5080683 0.5379051 0.5691409 0.4424466 0.5665093 0.5834919
##  [85] 0.4744141 0.4648387 0.5210882 0.5184217 0.4942142 0.5294703 0.5016941
##  [92] 0.5033375 0.4156738 0.5508334 0.5161423 0.5144654 0.4421934 0.4754283
##  [99] 0.4968978 0.4386566
#Plotting the histogram of w when k=100:
hist(w1)

## From this histogram, we see that w, i.e. the collection of k estimates of mean is distributed between 0.35 to 0.65

#(4)

#mean when k=100
mean(w1)
## [1] 0.5013303
#variance when k=100
var(w1)
## [1] 0.002224352
#(5)
#When k = 5000

k = 5000
X = runif(50*k, min = 0, max = 1)
X <- matrix(X, nrow = k) #100 rows of X each containing 50 random numbers.

mu <- as.vector(k) #mean calculation

for(i in 1:k)
{
  mu[i] = mean(X[i,])  #calculating k estimates of the sample mean
}

w2 = mu #Stacking them into a vector w


#Plotting the histogram of w when k=5000:
hist(w2)

#mean when k=5000
mean(w2)
## [1] 0.499957
#mean when k=5000
var(w2)
## [1] 0.001707491
#(5)
#When k = 100000

k = 100000
X = runif(50*k, min = 0, max = 1)
X <- matrix(X, nrow = k) #100 rows of X each containing 50 random numbers.

mu <- as.vector(k) #mean calculation

for(i in 1:k)
{
  mu[i] = mean(X[i,])  #calculating k estimates of the sample mean
}

w3 = mu #Stacking them into a vector w

#Plotting the histogram of w when k=100000:
hist(w3)

#mean when k=100000
mean(w3)
## [1] 0.4998422
#mean when k=100000
var(w3)
## [1] 0.001673399
#(6)
#Plot the histogram of w-mean(w)/stdev(w) for k = 100; 5000; 100000:

#When k = 100

y1 = (w1 - mean(w1))/sd(w1)
hist(y1)

mean(y1)
## [1] -3.368139e-16
var(y1)
## [1] 1
y11 = dnorm(y1, mean = 0, sd = 1)

plot(y1, y11, type = "h", xlab = "function x when k = 100", ylab = "Normal Distribution" , main = "Density of the Gaussian distribution with mean = 0 and variance = 0")

#When k = 5000

y2 = (w2 - mean(w2))/sd(w2)
hist(y2)

mean(y2)
## [1] 2.504488e-16
var(y2)
## [1] 1
y21 = dnorm(y2, mean = 0, sd = 1)

plot(y2, y21, type = "h", xlab = "function x when k = 5000", ylab = "Normal Distribution" , main = "Density of the Gaussian distribution with mean = 0 and variance = 0")

#when k = 100000

y3 = (w3 - mean(w3))/sd(w3)
hist(y3)

mean(y3)
## [1] 3.484901e-16
var(y3)
## [1] 1
y31 = dnorm(y3, mean = 0, sd = 1)

plot(y3, y31, type = "h", xlab = "function x when k = 100000", ylab = "Normal Distribution" , main = "Density of the Gaussian distribution with mean = 0 and variance = 0")

## Thus, we observe that the histogram gets closer to the density of the Gaussian distribution with mean 0 and variance 1 as k gets larger.




#(7)

#The dataset teengamb concerns a study of teenage gambling in Britain. Fit a regression
#model with the expenditure on gambling as the response and the sex, status, income
#and verbal score as predictors. Present the output.

library("faraway")

data("teengamb")
teengamb
##    sex status income verbal gamble
## 1    1     51   2.00      8   0.00
## 2    1     28   2.50      8   0.00
## 3    1     37   2.00      6   0.00
## 4    1     28   7.00      4   7.30
## 5    1     65   2.00      8  19.60
## 6    1     61   3.47      6   0.10
## 7    1     28   5.50      7   1.45
## 8    1     27   6.42      5   6.60
## 9    1     43   2.00      6   1.70
## 10   1     18   6.00      7   0.10
## 11   1     18   3.00      6   0.10
## 12   1     43   4.75      6   5.40
## 13   1     30   2.20      4   1.20
## 14   1     28   2.00      6   3.60
## 15   1     38   3.00      6   2.40
## 16   1     38   1.50      8   3.40
## 17   1     28   9.50      8   0.10
## 18   1     18  10.00      5   8.40
## 19   1     43   4.00      8  12.00
## 20   0     51   3.50      9   0.00
## 21   0     62   3.00      8   1.00
## 22   0     47   2.50      9   1.20
## 23   0     43   3.50      5   0.10
## 24   0     27  10.00      4 156.00
## 25   0     71   6.50      7  38.50
## 26   0     38   1.50      7   2.10
## 27   0     51   5.44      4  14.50
## 28   0     38   1.00      6   3.00
## 29   0     51   0.60      7   0.60
## 30   0     62   5.50      8   9.60
## 31   0     18  12.00      2  88.00
## 32   0     30   7.00      7  53.20
## 33   0     38  15.00      7  90.00
## 34   0     71   2.00     10   3.00
## 35   0     28   1.50      1  14.10
## 36   0     61   4.50      8  70.00
## 37   0     71   2.50      7  38.50
## 38   0     28   8.00      6  57.20
## 39   0     51  10.00      6   6.00
## 40   0     65   1.60      6  25.00
## 41   0     48   2.00      9   6.90
## 42   0     61  15.00      9  69.70
## 43   0     75   3.00      8  13.30
## 44   0     66   3.25      9   0.60
## 45   0     62   4.94      6  38.00
## 46   0     71   1.50      7  14.40
## 47   0     71   2.50      9  19.20
teengamb$sex <- factor(teengamb$sex)
levels(teengamb$sex) <- c("male","female")

summary(teengamb)
##      sex         status          income           verbal     
##  male  :28   Min.   :18.00   Min.   : 0.600   Min.   : 1.00  
##  female:19   1st Qu.:28.00   1st Qu.: 2.000   1st Qu.: 6.00  
##              Median :43.00   Median : 3.250   Median : 7.00  
##              Mean   :45.23   Mean   : 4.642   Mean   : 6.66  
##              3rd Qu.:61.50   3rd Qu.: 6.210   3rd Qu.: 8.00  
##              Max.   :75.00   Max.   :15.000   Max.   :10.00  
##      gamble     
##  Min.   :  0.0  
##  1st Qu.:  1.1  
##  Median :  6.0  
##  Mean   : 19.3  
##  3rd Qu.: 19.4  
##  Max.   :156.0
regmodel <- lm (gamble ~ sex + status + income + verbal, data=teengamb)

summary(regmodel)
## 
## Call:
## lm(formula = gamble ~ sex + status + income + verbal, data = teengamb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.082 -11.320  -1.451   9.452  94.252 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  22.55565   17.19680   1.312   0.1968    
## sexfemale   -22.11833    8.21111  -2.694   0.0101 *  
## status        0.05223    0.28111   0.186   0.8535    
## income        4.96198    1.02539   4.839 1.79e-05 ***
## verbal       -2.95949    2.17215  -1.362   0.1803    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.69 on 42 degrees of freedom
## Multiple R-squared:  0.5267, Adjusted R-squared:  0.4816 
## F-statistic: 11.69 on 4 and 42 DF,  p-value: 1.815e-06
#(a) What percentage of variation in the response is explained by these predictors?

## The coefficient of determination or percentage of variance is nothing but R^2:

## The percentage of variation in the response is given by the Multiple R-squared, which is 52.67%

#(b) Which observation has the largest (positive) residual? Give the case number.

which.max(regmodel$residuals) 
## 24 
## 24
## The 24th case has the largest residual.

#(c) Compute the mean and median of the residuals.

mean(regmodel$residuals)
## [1] -3.065293e-17
median(regmodel$residuals)
## [1] -1.451392
#(d) Compute the correlation of the residuals with the fitted values.

cor(regmodel$residuals, regmodel$fitted.values)
## [1] -1.070659e-16
#(e) Compute the correlation of the residuals with the income.

cor(regmodel$residuals, teengamb$income)
## [1] -7.242382e-17
#(f) For all other predictors held constant, what would be the difference in predicted
#expenditure on gambling for a male compared to a female?

## As we can see from the summary of our model:

summary(regmodel)
## 
## Call:
## lm(formula = gamble ~ sex + status + income + verbal, data = teengamb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.082 -11.320  -1.451   9.452  94.252 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  22.55565   17.19680   1.312   0.1968    
## sexfemale   -22.11833    8.21111  -2.694   0.0101 *  
## status        0.05223    0.28111   0.186   0.8535    
## income        4.96198    1.02539   4.839 1.79e-05 ***
## verbal       -2.95949    2.17215  -1.362   0.1803    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.69 on 42 degrees of freedom
## Multiple R-squared:  0.5267, Adjusted R-squared:  0.4816 
## F-statistic: 11.69 on 4 and 42 DF,  p-value: 1.815e-06
regmodel$coefficients
##  (Intercept)    sexfemale       status       income       verbal 
##  22.55565063 -22.11833009   0.05223384   4.96197922  -2.95949350
## Coefficient of sexfemale: -22.1183301

## Thus, we can say that on an average, teenage females spend $22.1183301 less on gamble than teenage males (when other predictors are being held constant).