library(asbio)
## Loading required package: tcltk
#PART 1


data(so2.us.cities)
attach(so2.us.cities)

A) The null hypothesis is that the slope equals 0

# B)

bmatrix <- (cbind(rep(1,nrow(so2.us.cities)),so2.us.cities$X1,so2.us.cities$X2,so2.us.cities$X3,so2.us.cities$X4,
                  so2.us.cities$X5,so2.us.cities$X6))

beta_hat <- {solve((t(bmatrix) %*% bmatrix))} %*% {t(bmatrix) %*% Y}
beta_hat
##              [,1]
## [1,] 113.79004167
## [2,]  -1.16748383
## [3,]   0.05437428
## [4,]  -0.02752411
## [5,]  -4.11045229
## [6,]   0.41696158
## [7,]  -0.04833085
# c)

y_hat <- bmatrix %*% beta_hat
y_hat
##             [,1]
##  [1,]   3.815575
##  [2,]  25.580588
##  [3,]  22.133322
##  [4,]  27.957768
##  [5,]  49.471516
##  [6,]  27.172845
##  [7,]  22.279613
##  [8,]   7.702336
##  [9,]   9.433796
## [10,]  25.512800
## [11,] 109.529328
## [12,]  22.260441
## [13,]  18.554036
## [14,]   3.467876
## [15,]  26.292826
## [16,]  18.888771
## [17,]  31.052657
## [18,]  36.901742
## [19,]  41.158045
## [20,]  26.462353
## [21,]  44.388485
## [22,]  16.999052
## [23,]   7.115872
## [24,]  28.630677
## [25,]  23.382189
## [26,]  44.112709
## [27,]  52.144667
## [28,]  26.870690
## [29,]  14.779697
## [30,]   9.097800
## [31,]  23.116327
## [32,]  29.733599
#D)

n <- length(Y)
YN <-matrix(1,n,n)

SSR <- {t(beta_hat)%*%t(bmatrix)%*%Y}-((1/n) %*% t(Y) %*% (YN) %*% Y ) 
SSR
##          [,1]
## [1,] 11841.25
# E)

SSE <- {t(Y) %*% Y} - {t(beta_hat) %*% t(bmatrix) %*% Y}
SSE
##          [,1]
## [1,] 2640.253
n1= length(Y)
#F)

p<- 7
n<- length(Y)

MSE <- (t(Y) %*% Y - t(beta_hat) %*% t(bmatrix) %*% Y)/(n-p)
MSE
##          [,1]
## [1,] 105.6101
MSR1 <- ((t(beta_hat) %*% t(bmatrix) %*% Y) - 1/n * t(Y) %*% Y)/(p-1)
MSR1
##          [,1]
## [1,] 5769.968
f.star1 <- MSR1/MSE
f.star1
##          [,1]
## [1,] 54.63462
p_values1 <- 2* pt(abs(f.star1),n-p,lower.tail = F)
p_values1
##              [,1]
## [1,] 1.558039e-27
# G)

e <- Y - y_hat
e
##              [,1]
##  [1,]   6.1844252
##  [2,] -12.5805881
##  [3,] -10.1333216
##  [4,] -10.9577685
##  [5,]   6.5284835
##  [6,]   8.8271546
##  [7,]   6.7203875
##  [8,]   6.2976640
##  [9,]   0.5662036
## [10,]  -1.5128000
## [11,]   0.4706715
## [12,]   5.7395593
## [13,]  -1.5540357
## [14,]   4.5321240
## [15,]   3.7071736
## [16,]  -9.8887708
## [17,]  15.9473435
## [18,]  -1.9017424
## [19,] -12.1580450
## [20,] -12.4623535
## [21,]  11.6115155
## [22,]  -2.9990517
## [23,]   3.8841281
## [24,]  17.3693227
## [25,] -12.3821894
## [26,] -21.1127094
## [27,]  12.8553332
## [28,]  -0.8706902
## [29,]  -5.7796973
## [30,]   0.9021996
## [31,]   4.8836730
## [32,]  -0.7335989
# K)

MSE <- (t(Y) %*% Y - t(beta_hat) %*% t(bmatrix) %*% Y)/(n-p)
MSE
##          [,1]
## [1,] 105.6101
MSR2 <- ((t(beta_hat) %*% t(bmatrix) %*% Y) - 1/n * t(Y) %*% matrix(1,nrow=n, ncol=n) %*% Y)/(p-1)
MSR2
##          [,1]
## [1,] 1973.541
F.star<- MSR2/MSE
F.star
##          [,1]
## [1,] 18.68705
# F-statistic is 18.69

p_value <- pf(F.star, p-1, n-p, lower.tail = F)
p_value
##              [,1]
## [1,] 3.895529e-08
# p-value is 3.90e^-8

# The conclusion is that the p-value are highly significant
# L)

q1 <- lm(Y~X1+X2+X3+X4+X5+X6,data = so2.us.cities)
summary.lm(q1)
## 
## Call:
## lm(formula = Y ~ X1 + X2 + X3 + X4 + X5 + X6, data = so2.us.cities)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.1127  -6.8070   0.5184   6.2127  17.3693 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 113.79004   34.65450   3.284  0.00303 ** 
## X1           -1.16748    0.45555  -2.563  0.01679 *  
## X2            0.05437    0.01158   4.697 8.18e-05 ***
## X3           -0.02752    0.01130  -2.435  0.02238 *  
## X4           -4.11045    1.48548  -2.767  0.01049 *  
## X5            0.41696    0.27798   1.500  0.14615    
## X6           -0.04833    0.12414  -0.389  0.70032    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.28 on 25 degrees of freedom
## Multiple R-squared:  0.8177, Adjusted R-squared:  0.7739 
## F-statistic: 18.69 on 6 and 25 DF,  p-value: 3.896e-08
model.matrix(q1)
##    (Intercept)   X1   X2   X3   X4    X5  X6
## 1            1 70.3  213  582  6.0  7.05  36
## 2            1 61.0   91  132  8.2 48.52 100
## 3            1 56.7  453  716  8.7 20.66  67
## 4            1 51.9  454  515  9.0 12.95  86
## 5            1 49.1  412  158  9.0 43.37 127
## 6            1 54.0   80   80  9.0 40.25 114
## 7            1 57.3  434  757  9.3 38.89 111
## 8            1 68.4  136  529  8.8 54.47 116
## 9            1 75.5  207  335  9.0 59.80 128
## 10           1 61.5  368  497  9.1 48.34 115
## 11           1 50.6 3344 3369 10.4 34.44 122
## 12           1 52.3  361  746  9.7 38.74 121
## 13           1 49.0  104  201 11.2 30.85 103
## 14           1 56.6  125  277 12.7 30.58  82
## 15           1 55.6  291  593  8.3 43.11 123
## 16           1 68.3  204  361  8.4 56.77 113
## 17           1 55.0  625  905  9.6 41.31 111
## 18           1 49.9 1064 1513 10.1 30.96 129
## 19           1 43.5  699  744 10.6 25.94 137
## 20           1 54.5  381  507 10.0 37.00  99
## 21           1 55.9  775  622  9.5 35.89 105
## 22           1 51.5  181  347 10.9 30.18  98
## 23           1 56.8   46  244  8.9  7.77  58
## 24           1 47.6   44  116  8.8 33.36 135
## 25           1 47.1  391  463 12.4 36.11 166
## 26           1 54.0  462  453  7.1 39.04 132
## 27           1 49.7 1007  751 10.9 34.99 155
## 28           1 51.5  266  540  8.6 37.01 134
## 29           1 66.2  641  844 10.9 35.94  73
## 30           1 68.9  721 1233 10.8 48.19 108
## 31           1 51.0  137  176  8.7 15.17  89
## 32           1 51.1  379  531  9.4 38.79 164
## attr(,"assign")
## [1] 0 1 2 3 4 5 6
#PART 2

regre<- read.table(file='regression.tsv', header= T)
regre
##    dose  metabolite
## 1     0    9.448641
## 2    10  -13.072656
## 3    20   -3.099901
## 4    30  -43.604798
## 5    40  -38.188878
## 6    50  -66.090357
## 7    60  -68.096225
## 8    70  -58.032302
## 9    80  -43.007988
## 10   90  -70.039365
## 11  100 -104.625353
plot(regre)

# what type of plot is this? - decreasing linear - negative slope

# linear regression 

as.matrix(regre)
##       dose  metabolite
##  [1,]    0    9.448641
##  [2,]   10  -13.072656
##  [3,]   20   -3.099901
##  [4,]   30  -43.604798
##  [5,]   40  -38.188878
##  [6,]   50  -66.090357
##  [7,]   60  -68.096225
##  [8,]   70  -58.032302
##  [9,]   80  -43.007988
## [10,]   90  -70.039365
## [11,]  100 -104.625353
regre.lm <- lm(regre$metabolite~regre$dose, data = regre)

# What is the best fitting line equation?
#y_hat = -0.91-0.89Xi+ ei
#e ~N(16.49^2)




summary.lm(regre.lm)
## 
## Call:
## lm(formula = regre$metabolite ~ regre$dose, data = regre)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.780 -14.413  -1.758  10.575  28.940 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.9134     9.3039  -0.098 0.923945    
## regre$dose   -0.8879     0.1573  -5.646 0.000315 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.49 on 9 degrees of freedom
## Multiple R-squared:  0.7798, Adjusted R-squared:  0.7554 
## F-statistic: 31.88 on 1 and 9 DF,  p-value: 0.0003152
abline(regre.lm)

# Check linear model assumptions

plot(regre.lm)

resid(regre.lm)
##          1          2          3          4          5          6 
##  10.362048  -3.279946  15.572113 -16.053480  -1.758256 -20.780431 
##          7          8          9         10         11 
## -13.906996   5.036231  28.939849  10.787776 -14.918908
fitted(regre.lm)
##           1           2           3           4           5           6 
##  -0.9134067  -9.7927105 -18.6720144 -27.5513182 -36.4306220 -45.3099259 
##           7           8           9          10          11 
## -54.1892297 -63.0685335 -71.9478373 -80.8271412 -89.7064450
par(mfrow=c(2,2))
plot(regre.lm, which=1:4)

plot(regre$dose,regre$metabolite)
lines(regre$dose,fitted(regre.lm))

# Analyze the result of the model fit:
# The linear models showed in the residuals vs fitted that they are not that separated apart
# in the normal Q-Q plot we could see a nice alignment with the elements with the line
# in the Cooks plot but the 9 looks like an outlier
# The R-quare is ~78% which is a good indicator of conident data
# We dont have to look at the adjusted R-squared , because that is for multiple regressions 
# Also the p-value is 0.0003152 which is excelent because it is even lower than 0.01
# Something to notice is that the point 9 it looks like an outlier in some most plots
# not that detrimental in the Q-Q plot though
# I will test extracting point 9 below

par(mfrow=c(2,2))
plot(regre.lm, which=1:4)

summary(lm(regre$metabolite~regre$dose,data=regre,subset=-9))
## 
## Call:
## lm(formula = regre$metabolite ~ regre$dose, data = regre, subset = -9)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.600  -9.072  -1.724   9.786  17.784 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.6767     7.5872   0.089    0.931    
## regre$dose   -0.9833     0.1339  -7.343 8.05e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.4 on 8 degrees of freedom
## Multiple R-squared:  0.8708, Adjusted R-squared:  0.8546 
## F-statistic: 53.92 on 1 and 8 DF,  p-value: 8.048e-05
# Looks like by extracting the point 9 I have increased the R squared to 87%
# Also the p-value has decreased so its excellent!!
# Even in the residual vs fitted plot I can see a better line drawn in the 
#diagnostic plots 

```