library(asbio)
## Loading required package: tcltk
#PART 1
data(so2.us.cities)
attach(so2.us.cities)
# B)
bmatrix <- (cbind(rep(1,nrow(so2.us.cities)),so2.us.cities$X1,so2.us.cities$X2,so2.us.cities$X3,so2.us.cities$X4,
so2.us.cities$X5,so2.us.cities$X6))
beta_hat <- {solve((t(bmatrix) %*% bmatrix))} %*% {t(bmatrix) %*% Y}
beta_hat
## [,1]
## [1,] 113.79004167
## [2,] -1.16748383
## [3,] 0.05437428
## [4,] -0.02752411
## [5,] -4.11045229
## [6,] 0.41696158
## [7,] -0.04833085
# c)
y_hat <- bmatrix %*% beta_hat
y_hat
## [,1]
## [1,] 3.815575
## [2,] 25.580588
## [3,] 22.133322
## [4,] 27.957768
## [5,] 49.471516
## [6,] 27.172845
## [7,] 22.279613
## [8,] 7.702336
## [9,] 9.433796
## [10,] 25.512800
## [11,] 109.529328
## [12,] 22.260441
## [13,] 18.554036
## [14,] 3.467876
## [15,] 26.292826
## [16,] 18.888771
## [17,] 31.052657
## [18,] 36.901742
## [19,] 41.158045
## [20,] 26.462353
## [21,] 44.388485
## [22,] 16.999052
## [23,] 7.115872
## [24,] 28.630677
## [25,] 23.382189
## [26,] 44.112709
## [27,] 52.144667
## [28,] 26.870690
## [29,] 14.779697
## [30,] 9.097800
## [31,] 23.116327
## [32,] 29.733599
#D)
n <- length(Y)
YN <-matrix(1,n,n)
SSR <- {t(beta_hat)%*%t(bmatrix)%*%Y}-((1/n) %*% t(Y) %*% (YN) %*% Y )
SSR
## [,1]
## [1,] 11841.25
# E)
SSE <- {t(Y) %*% Y} - {t(beta_hat) %*% t(bmatrix) %*% Y}
SSE
## [,1]
## [1,] 2640.253
n1= length(Y)
#F)
p<- 7
n<- length(Y)
MSE <- (t(Y) %*% Y - t(beta_hat) %*% t(bmatrix) %*% Y)/(n-p)
MSE
## [,1]
## [1,] 105.6101
MSR1 <- ((t(beta_hat) %*% t(bmatrix) %*% Y) - 1/n * t(Y) %*% Y)/(p-1)
MSR1
## [,1]
## [1,] 5769.968
f.star1 <- MSR1/MSE
f.star1
## [,1]
## [1,] 54.63462
p_values1 <- 2* pt(abs(f.star1),n-p,lower.tail = F)
p_values1
## [,1]
## [1,] 1.558039e-27
# G)
e <- Y - y_hat
e
## [,1]
## [1,] 6.1844252
## [2,] -12.5805881
## [3,] -10.1333216
## [4,] -10.9577685
## [5,] 6.5284835
## [6,] 8.8271546
## [7,] 6.7203875
## [8,] 6.2976640
## [9,] 0.5662036
## [10,] -1.5128000
## [11,] 0.4706715
## [12,] 5.7395593
## [13,] -1.5540357
## [14,] 4.5321240
## [15,] 3.7071736
## [16,] -9.8887708
## [17,] 15.9473435
## [18,] -1.9017424
## [19,] -12.1580450
## [20,] -12.4623535
## [21,] 11.6115155
## [22,] -2.9990517
## [23,] 3.8841281
## [24,] 17.3693227
## [25,] -12.3821894
## [26,] -21.1127094
## [27,] 12.8553332
## [28,] -0.8706902
## [29,] -5.7796973
## [30,] 0.9021996
## [31,] 4.8836730
## [32,] -0.7335989
# K)
MSE <- (t(Y) %*% Y - t(beta_hat) %*% t(bmatrix) %*% Y)/(n-p)
MSE
## [,1]
## [1,] 105.6101
MSR2 <- ((t(beta_hat) %*% t(bmatrix) %*% Y) - 1/n * t(Y) %*% matrix(1,nrow=n, ncol=n) %*% Y)/(p-1)
MSR2
## [,1]
## [1,] 1973.541
F.star<- MSR2/MSE
F.star
## [,1]
## [1,] 18.68705
# F-statistic is 18.69
p_value <- pf(F.star, p-1, n-p, lower.tail = F)
p_value
## [,1]
## [1,] 3.895529e-08
# p-value is 3.90e^-8
# The conclusion is that the p-value are highly significant
# L)
q1 <- lm(Y~X1+X2+X3+X4+X5+X6,data = so2.us.cities)
summary.lm(q1)
##
## Call:
## lm(formula = Y ~ X1 + X2 + X3 + X4 + X5 + X6, data = so2.us.cities)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.1127 -6.8070 0.5184 6.2127 17.3693
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 113.79004 34.65450 3.284 0.00303 **
## X1 -1.16748 0.45555 -2.563 0.01679 *
## X2 0.05437 0.01158 4.697 8.18e-05 ***
## X3 -0.02752 0.01130 -2.435 0.02238 *
## X4 -4.11045 1.48548 -2.767 0.01049 *
## X5 0.41696 0.27798 1.500 0.14615
## X6 -0.04833 0.12414 -0.389 0.70032
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.28 on 25 degrees of freedom
## Multiple R-squared: 0.8177, Adjusted R-squared: 0.7739
## F-statistic: 18.69 on 6 and 25 DF, p-value: 3.896e-08
model.matrix(q1)
## (Intercept) X1 X2 X3 X4 X5 X6
## 1 1 70.3 213 582 6.0 7.05 36
## 2 1 61.0 91 132 8.2 48.52 100
## 3 1 56.7 453 716 8.7 20.66 67
## 4 1 51.9 454 515 9.0 12.95 86
## 5 1 49.1 412 158 9.0 43.37 127
## 6 1 54.0 80 80 9.0 40.25 114
## 7 1 57.3 434 757 9.3 38.89 111
## 8 1 68.4 136 529 8.8 54.47 116
## 9 1 75.5 207 335 9.0 59.80 128
## 10 1 61.5 368 497 9.1 48.34 115
## 11 1 50.6 3344 3369 10.4 34.44 122
## 12 1 52.3 361 746 9.7 38.74 121
## 13 1 49.0 104 201 11.2 30.85 103
## 14 1 56.6 125 277 12.7 30.58 82
## 15 1 55.6 291 593 8.3 43.11 123
## 16 1 68.3 204 361 8.4 56.77 113
## 17 1 55.0 625 905 9.6 41.31 111
## 18 1 49.9 1064 1513 10.1 30.96 129
## 19 1 43.5 699 744 10.6 25.94 137
## 20 1 54.5 381 507 10.0 37.00 99
## 21 1 55.9 775 622 9.5 35.89 105
## 22 1 51.5 181 347 10.9 30.18 98
## 23 1 56.8 46 244 8.9 7.77 58
## 24 1 47.6 44 116 8.8 33.36 135
## 25 1 47.1 391 463 12.4 36.11 166
## 26 1 54.0 462 453 7.1 39.04 132
## 27 1 49.7 1007 751 10.9 34.99 155
## 28 1 51.5 266 540 8.6 37.01 134
## 29 1 66.2 641 844 10.9 35.94 73
## 30 1 68.9 721 1233 10.8 48.19 108
## 31 1 51.0 137 176 8.7 15.17 89
## 32 1 51.1 379 531 9.4 38.79 164
## attr(,"assign")
## [1] 0 1 2 3 4 5 6
#PART 2
regre<- read.table(file='regression.tsv', header= T)
regre
## dose metabolite
## 1 0 9.448641
## 2 10 -13.072656
## 3 20 -3.099901
## 4 30 -43.604798
## 5 40 -38.188878
## 6 50 -66.090357
## 7 60 -68.096225
## 8 70 -58.032302
## 9 80 -43.007988
## 10 90 -70.039365
## 11 100 -104.625353
plot(regre)
# what type of plot is this? - decreasing linear - negative slope
# linear regression
as.matrix(regre)
## dose metabolite
## [1,] 0 9.448641
## [2,] 10 -13.072656
## [3,] 20 -3.099901
## [4,] 30 -43.604798
## [5,] 40 -38.188878
## [6,] 50 -66.090357
## [7,] 60 -68.096225
## [8,] 70 -58.032302
## [9,] 80 -43.007988
## [10,] 90 -70.039365
## [11,] 100 -104.625353
regre.lm <- lm(regre$metabolite~regre$dose, data = regre)
# What is the best fitting line equation?
#y_hat = -0.91-0.89Xi+ ei
#e ~N(16.49^2)
summary.lm(regre.lm)
##
## Call:
## lm(formula = regre$metabolite ~ regre$dose, data = regre)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.780 -14.413 -1.758 10.575 28.940
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.9134 9.3039 -0.098 0.923945
## regre$dose -0.8879 0.1573 -5.646 0.000315 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.49 on 9 degrees of freedom
## Multiple R-squared: 0.7798, Adjusted R-squared: 0.7554
## F-statistic: 31.88 on 1 and 9 DF, p-value: 0.0003152
abline(regre.lm)
# Check linear model assumptions
plot(regre.lm)
resid(regre.lm)
## 1 2 3 4 5 6
## 10.362048 -3.279946 15.572113 -16.053480 -1.758256 -20.780431
## 7 8 9 10 11
## -13.906996 5.036231 28.939849 10.787776 -14.918908
fitted(regre.lm)
## 1 2 3 4 5 6
## -0.9134067 -9.7927105 -18.6720144 -27.5513182 -36.4306220 -45.3099259
## 7 8 9 10 11
## -54.1892297 -63.0685335 -71.9478373 -80.8271412 -89.7064450
par(mfrow=c(2,2))
plot(regre.lm, which=1:4)
plot(regre$dose,regre$metabolite)
lines(regre$dose,fitted(regre.lm))
# Analyze the result of the model fit:
# The linear models showed in the residuals vs fitted that they are not that separated apart
# in the normal Q-Q plot we could see a nice alignment with the elements with the line
# in the Cooks plot but the 9 looks like an outlier
# The R-quare is ~78% which is a good indicator of conident data
# We dont have to look at the adjusted R-squared , because that is for multiple regressions
# Also the p-value is 0.0003152 which is excelent because it is even lower than 0.01
# Something to notice is that the point 9 it looks like an outlier in some most plots
# not that detrimental in the Q-Q plot though
# I will test extracting point 9 below
par(mfrow=c(2,2))
plot(regre.lm, which=1:4)
summary(lm(regre$metabolite~regre$dose,data=regre,subset=-9))
##
## Call:
## lm(formula = regre$metabolite ~ regre$dose, data = regre, subset = -9)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.600 -9.072 -1.724 9.786 17.784
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.6767 7.5872 0.089 0.931
## regre$dose -0.9833 0.1339 -7.343 8.05e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.4 on 8 degrees of freedom
## Multiple R-squared: 0.8708, Adjusted R-squared: 0.8546
## F-statistic: 53.92 on 1 and 8 DF, p-value: 8.048e-05
# Looks like by extracting the point 9 I have increased the R squared to 87%
# Also the p-value has decreased so its excellent!!
# Even in the residual vs fitted plot I can see a better line drawn in the
#diagnostic plots
```