Tasks

Task 1. Basic statistics analysis

Get Data

myData <- read.csv("/Users/ruthokoilu/Documents/Fall 17/ CSC591/Project2/OKOILU RUTH.csv")
head(myData)
##          X1 X2          X3   X4           X5         Y
## 1  -1.01261  3 0.001361302  504 1.877810e-55  239.0384
## 2 155.18308  5 0.019201820 1400 1.589644e-08 1980.3923
## 3  90.64515  4 0.010076991  896 5.526546e-04 1193.7046
## 4  74.27032  3 0.005263104  504 2.403234e-25  886.8339
## 5  39.44549  3 0.003669509  504 4.761983e-05  605.3050
## 6  51.14948  2 0.001775451  224 6.342782e-12  532.1127
summary(myData)
##        X1                X2              X3                  X4        
##  Min.   :-145.04   Min.   :1.000   Min.   :1.887e-05   Min.   :  56.0  
##  1st Qu.:  27.53   1st Qu.:2.000   1st Qu.:4.651e-03   1st Qu.: 224.0  
##  Median :  65.41   Median :3.000   Median :1.208e-02   Median : 504.0  
##  Mean   :  63.29   Mean   :2.852   Mean   :1.827e-02   Mean   : 560.9  
##  3rd Qu.:  99.82   3rd Qu.:4.000   3rd Qu.:2.474e-02   3rd Qu.: 896.0  
##  Max.   : 229.08   Max.   :5.000   Max.   :1.518e-01   Max.   :1400.0  
##        X5                  Y          
##  Min.   :0.0000000   Min.   :-1188.6  
##  1st Qu.:0.0000000   1st Qu.:  465.5  
##  Median :0.0000000   Median :  809.3  
##  Mean   :0.0092162   Mean   :  819.6  
##  3rd Qu.:0.0000204   3rd Qu.: 1182.9  
##  Max.   :0.8897839   Max.   : 2179.2
library(ggplot2)

1.1 Calculating the histogram, mean, and variance for each variable Xi (i.e., the column in the data set corresponding to Xi)

Histogram

par(mfrow=c(1,1))
#qplot(myData[, 1], geom="histogram") 

qplot(myData[, 1],
      geom="histogram",
      binwidth = 5,  
      main = "Histogram for Column X1", 
      xlab = "X1",  
      fill=I("blue"), 
      col=I("red"), 
      alpha=I(.2),
      xlim=c(20,50))  # xlim restricts the plot to [20, 50]; values outside are dropped, hence the warning below
## Warning: Removed 422 rows containing non-finite values (stat_bin).

qplot(myData[, 2],
      geom="histogram",
      binwidth = 1,  
      main = "Histogram for Column X2", 
      xlab = "X2",  
      fill=I("blue"), 
      col=I("red"), 
      alpha=I(.2))

qplot(myData[, 3],
      geom="histogram",
      binwidth = 0.005,  
      main = "Histogram for Column X3", 
      xlab = "X3",  
      fill=I("blue"), 
      col=I("red"), 
      alpha=I(.2))

qplot(myData[, 4],
      geom="histogram",
      binwidth = 30,  
      main = "Histogram for Column X4", 
      xlab = "X4",  
      fill=I("blue"), 
      col=I("red"), 
      alpha=I(.2))

qplot(myData[, 5],
      geom="histogram",
      binwidth = 0.01,  
      main = "Histogram for Column X5", 
      xlab = "X5",  
      fill=I("blue"), 
      col=I("red"), 
      alpha=I(.2))

qplot(myData[, 6],
      geom="histogram",
      binwidth = 30,  
      main = "Histogram for Column Y", 
      xlab = "Y",  
      fill=I("blue"), 
      col=I("red"), 
      alpha=I(.2))

The histogram for column Y appears to be approximately normally distributed.
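A quick way to sanity-check this impression (a sketch, not part of the original output) is a normal Q-Q plot of Y itself:

# Normal Q-Q plot of Y: points close to the reference line support approximate normality
qqnorm(myData[, 6], main = "Normal Q-Q Plot of Y")
qqline(myData[, 6])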

Calculate the histogram (frequency table) for each column:

#as.data.frame(table(myData[, 1]))  
as.data.frame(table(myData[, 2])) #can make factor
##   Var1 Freq
## 1    1  116
## 2    2   92
## 3    3  115
## 4    4  104
## 5    5   73
#as.data.frame(table(myData[, 3])) 
as.data.frame(table(myData[, 4])) #Can make factor
##   Var1 Freq
## 1   56  116
## 2  224   92
## 3  504  115
## 4  896  104
## 5 1400   73
#as.data.frame(table(myData[, 5])) 
#as.data.frame(table(myData[, 6])) 

Mean

options(scipen=9999)

Columns <- c("X1", "X2", "X3", "X4", "X5","Y")
mean_1 <- mean(myData[, 1])
mean_2 <- mean(myData[, 2])
mean_3 <- mean(myData[, 3])
mean_4 <- mean(myData[, 4])
mean_5 <- mean(myData[, 5])
mean_6 <- mean(myData[, 6])
Mean <- c(mean_1, mean_2, mean_3, mean_4, mean_5, mean_6)
all_means <- data.frame(Columns, Mean) 
all_means
##   Columns          Mean
## 1      X1  63.289457093
## 2      X2   2.852000000
## 3      X3   0.018268310
## 4      X4 560.896000000
## 5      X5   0.009216197
## 6       Y 819.612549513

Variance

Columns <- c("X1", "X2", "X3", "X4", "X5","Y")
var_1 <- var(myData[, 1])
var_2 <- var(myData[, 2])
var_3 <- var(myData[, 3])
var_4 <- var(myData[, 4])
var_5 <- var(myData[, 5])
var_6 <- var(myData[, 6])
Variance <- c(var_1, var_2, var_3, var_4, var_5, var_6)
all_vars <- data.frame(Columns, Variance) 
all_vars
##   Columns          Variance
## 1      X1   3364.7627563464
## 2      X2      1.8858677355
## 3      X3      0.0004089895
## 4      X4 207339.7005851703
## 5      X5      0.0030261775
## 6       Y 293780.4107939845
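The same statistics can also be obtained in one call per function; this is a compact alternative sketch, not how the tables above were produced:

# Column-wise mean and variance for all six variables at once
sapply(myData, mean)
sapply(myData, var)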

1.2 Boxplots to detect outliers

boxplot(myData[, 1], main = "Boxplot for Column X1")

boxplot(myData[, 2], main = "Boxplot for Column X2")

boxplot(myData[, 3], main = "Boxplot for Column X3")

boxplot(myData[, 4], main = "Boxplot for Column X4")

boxplot(myData[, 5], main = "Boxplot for Column X5")

boxplot(myData[, 6], main = "Boxplot for Column Y")

X2 has no outlier while X4 has a few.

Removing Outliers

removeOutliers <- function(newData){
  # boxplot.stats()$stats gives: lower whisker, Q1, median, Q3, upper whisker
  interQValues <- boxplot.stats(newData[, 1])
  lowerWhisker <- interQValues$stats[1]
  upperWhisker <- interQValues$stats[5]
  # Keep only the rows whose first-column value lies within the whiskers
  newData2 <- subset(newData, newData[, 1] > lowerWhisker & newData[, 1] < upperWhisker)
  return(newData2)
}

par(mfrow=c(1,2))
boxplot(myData[, 1], main="Before removing outliers")
#new_X1 <- removeOutliers(myData[,1])
new_X1 <- removeOutliers(myData)
boxplot(new_X1[,1], main="After removing outliers")

myData<-new_X1

1.3 Calculate the correlation matrix Σ among all variables, i.e., Y, X1, X2, X3, X4, and X5.

cor(myData)
##              X1           X2            X3           X4          X5
## X1 1.0000000000  0.008191419  0.0003856156  0.020274771  0.06479151
## X2 0.0081914191  1.000000000 -0.0055883036  0.978999078  0.03277857
## X3 0.0003856156 -0.005588304  1.0000000000 -0.003600965 -0.05436637
## X4 0.0202747709  0.978999078 -0.0036009650  1.000000000  0.05516787
## X5 0.0647915069  0.032778570 -0.0543663701  0.055167871  1.00000000
## Y  0.9052673858  0.423443509 -0.0004322085  0.442592994  0.08244086
##                Y
## X1  0.9052673858
## X2  0.4234435093
## X3 -0.0004322085
## X4  0.4425929943
## X5  0.0824408552
## Y   1.0000000000

1.4 Comment on the results.

  • X1 and X2 can be used as predictors of Y because they have a significant correlation with Y (0.905 and 0.423, respectively) and are likely to produce a good fit. X4 is also correlated with Y, but it is almost perfectly correlated with X2 (r ≈ 0.979), so including both would introduce multicollinearity.

Task 2: Linear regression

Before proceeding with the multiple regression, you will carry out a simple linear regression to estimate the parameters of the model: Y = a0 + a1X + ε, where X = X1

linearModel <- lm(Y ~ X1, data= myData)
summary(linearModel)
## 
## Call:
## lm(formula = Y ~ X1, data = myData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -280.6 -179.8  -30.9  161.6  433.6 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 284.6896    15.1653   18.77 <0.0000000000000002 ***
## X1            8.4741     0.1791   47.31 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 222.2 on 493 degrees of freedom
## Multiple R-squared:  0.8195, Adjusted R-squared:  0.8191 
## F-statistic:  2238 on 1 and 493 DF,  p-value: < 0.00000000000000022

From the summary of the linear model above, we can see the details of the model created. The summary shows the statistics (min, max, median, and interquartile values) of the residuals of the analysis. The residual is simply the error: the actual value of Y minus the predicted value Yhat,

\[\hat{\epsilon}_i = Y_i - \hat{Y}_i\]

2.1 Determine the values for a0, a1, and s2

The summary also shows the parameters of the model Y = a0 + a1X + ε. It can be seen that:

# What are the coefficients a0 and a1? 
#To print out the coefficients

print(coef(linearModel)[1])
## (Intercept) 
##    284.6896
print(coef(linearModel)[2])
##       X1 
## 8.474147

a0 is 284.6896

a1 is 8.474147

The residual standard error is 222.2219, so the residual variance σ² is its square, approximately 49382.59:

sigmaSd <- summary(linearModel)$sigma
sigmaSd^2
## [1] 49382.59

The residual variance s² is shown above.
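The same value can be recovered directly from the residual definition above (a sketch, assuming the model is stored in linearModel as shown):

# epsilon-hat_i = Y_i - Yhat_i, then SSE / (n - 2) gives the residual variance
res <- myData$Y - fitted(linearModel)
sum(res^2) / linearModel$df.residual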

2.2 Check the p-values, R2, and F value to assess the significance of the regression coefficients

The p-value is

p_value <- anova(linearModel)$'Pr(>F)'[1]
p_value
## [1] 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002063461

The coefficient of determination R² is 0.819509. R² ranges from 0 to 1: a value near 0 indicates a poor fit, while a value near 1 indicates a good fit. Since our value is close to 1, we have a good fit.

rSquared <- summary(linearModel)$r.squared
rSquared
## [1] 0.819509

The F value tells us whether to reject H0. If F is greater than the critical F value, we reject H0; otherwise we fail to reject it. The F statistic is 2238.439 on 1 and 493 degrees of freedom.

f_value <- summary(linearModel)[10]
f_value
## $fstatistic
##    value    numdf    dendf 
## 2238.439    1.000  493.000

The adjusted R squared is used in feature selection to tell us which independent variables improve our model: it increases only when a newly added variable improves the model more than would be expected by chance.

adj.r.squared <- summary(linearModel)[9]
adj.r.squared
## $adj.r.squared
## [1] 0.8191429

2.3 Plot the regression line against the data.
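The figure itself is not reproduced here; a minimal sketch of how the regression line can be drawn against the data, assuming linearModel from above:

# Scatter of Y against X1 with the fitted regression line overlaid
plot(myData$X1, myData$Y, xlab = "X1", ylab = "Y",
     main = "Y vs X1 with fitted regression line")
abline(linearModel, col = "red", lwd = 2)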

Looking at the plot, the regression line appears to be a good fit.

2.4 Do a residuals analysis:

a. Do a Q-Q plot of the pdf of the residuals against N(0, s2)
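The Q-Q plot is not reproduced here; a sketch of how it can be generated, mirroring the approach used in Task 3 below:

# Q-Q plot of the standardized residuals against the standard normal quantiles
lm.stdres <- rstandard(linearModel)
qqnorm(lm.stdres,
       ylab = "Standardized Residuals",
       xlab = "Normal Scores",
       main = "Q~Q Plot")
qqline(lm.stdres)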

The plot above shows that the residuals are not quite normally distributed: the left and right tails deviate from the 45-degree reference line, so the distribution of the residuals is not identical to a standard normal distribution.

2.7 Use a higher-order polynomial regression, i.e., Y = a0 + a1X + a2X² + ε, to see if it gives better results.
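The code that produced the summary below is not shown; a minimal sketch, assuming the squared term is stored in a column named X.squared (the name that appears in the output):

# Add the squared term and fit the second-order model Y = a0 + a1*X1 + a2*X1^2 + e
myData$X.squared <- myData$X1^2
polyModel <- lm(Y ~ X1 + X.squared, data = myData)
summary(polyModel)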

## 
## Call:
## lm(formula = Y ~ X1 + X.squared, data = myData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -282.18 -179.30  -31.03  161.58  433.74 
## 
## Coefficients:
##               Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 284.798920  15.260126   18.66 <0.0000000000000002 ***
## X1            8.453078   0.349348   24.20 <0.0000000000000002 ***
## X.squared     0.000172   0.002448    0.07               0.944    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 222.4 on 492 degrees of freedom
## Multiple R-squared:  0.8195, Adjusted R-squared:  0.8188 
## F-statistic:  1117 on 2 and 492 DF,  p-value: < 0.00000000000000022
## (Intercept) 
##    284.7989
##       X1 
## 8.453078
## [1] 222.4466
## [1] 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000004852096
## [1] 0.8195109
## $fstatistic
##    value    numdf    dendf 
## 1116.963    2.000  492.000
## $adj.r.squared
## [1] 0.8187772
## [1] 0.8195109

2.8 Comment on your results in a couple of paragraphs.

a0 becomes 284.7989 and a1 becomes 8.453078; the p-value roughly doubled and the F value dropped by about half. R² is 0.8195109, which is essentially the same as the R² of the previous analysis (0.819509), and the coefficient of the squared term (0.000172) is not significant (p = 0.944). Using a polynomial term therefore does not improve the fit.
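The same conclusion can be reached with a nested-model F test (a sketch, using the hypothetical polyModel name from the sketch above):

# Compare the simple and quadratic models; a large p-value means the extra term adds nothing
anova(linearModel, polyModel)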

Task 3. Multivariate regression

3.1 Carry out a multiple regression on all the independent variables, and determine the values for all the coefficients, and σ2.

Before fitting models for Y, each of X1, X2, X3, and X5 is first regressed on the other three to check for multicollinearity.

multivariateModel <- lm(X1 ~ X2 + X3 + X5 , data= myData)
summary(multivariateModel)
## 
## Call:
## lm(formula = X1 ~ X2 + X3 + X5, data = myData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -139.602  -35.789    1.345   35.918  130.600 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)  62.1940     6.2694   9.920 <0.0000000000000002 ***
## X2            0.2479     1.8345   0.135               0.893    
## X3           10.8480   124.0824   0.087               0.930    
## X5           65.4402    45.5653   1.436               0.152    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 55.87 on 491 degrees of freedom
## Multiple R-squared:  0.00425,    Adjusted R-squared:  -0.001834 
## F-statistic: 0.6986 on 3 and 491 DF,  p-value: 0.5532
# Coefficients are:
print(coef(multivariateModel )[1])
## (Intercept) 
##    62.19397
print(coef(multivariateModel)[2])
##        X2 
## 0.2479108
print(coef(multivariateModel)[3])
##       X3 
## 10.84797
print(coef(multivariateModel)[4])
##       X5 
## 65.44023
#  σ2 is the square of the residual standard error shown below

multSigmasd <- summary(multivariateModel)$sigma
#Residual standard error (sigma)
summary(multivariateModel)$sigma
## [1] 55.87247
p_value <- anova(multivariateModel)$'Pr(>F)'[1]
p_value
## [1] 0.8557389
rSquared <- summary(multivariateModel)$r.squared
rSquared
## [1] 0.004250296
f_value <- summary(multivariateModel)[10]
f_value
## $fstatistic
##      value      numdf      dendf 
##   0.698601   3.000000 491.000000
adj.r.squared <- summary(multivariateModel)[9]
adj.r.squared
## $adj.r.squared
## [1] -0.001833715
par(mfrow=c(2,2))
plot(multivariateModel)

par(mfrow=c(1,1))
#plot(multivariateModel, which = 2, main = "Q~Q Plot")
mmf.stdres = rstandard(multivariateModel)
qqnorm(mmf.stdres, 
     ylab="Standardized Residuals", 
     xlab="Normal Scores", 
    main="Q~Q Plot") 
 qqline(mmf.stdres)

plot(multivariateModel$residuals, main = "Residuals")

Observation on Multivariate regression: X1 ~ X2 + X3 + X5

The intercept and coefficients are shown above. The coefficient of determination R² is 0.00425. Since R² is close to 0, this model is a poor fit, which means there is no multicollinearity between X1 and the other independent variables. With VIF = 1/(1 − R²), the VIF is also close to 1.
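As a quick numeric check (a sketch, assuming rSquared still holds the value just computed):

# VIF for X1 against the other predictors; a value near 1 means no variance inflation
vif_X1 <- 1 / (1 - rSquared)
vif_X1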

The Q-Q plot above shows that the residuals are normally distributed: the left and right tails follow the 45-degree reference line, so the distribution of the residuals is close to a standard normal distribution.

From the above plot, we can see that the residuals are randomly scattered. They are neither positively nor negatively correlated, and they appear to have a constant variance.

multivariateModel <- lm(X2 ~ X1 + X3 + X5 , data= myData)
summary(multivariateModel)
## 
## Call:
## lm(formula = X2 ~ X1 + X3 + X5, data = myData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1830 -0.8670  0.1399  1.1414  2.1747 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)  2.85041    0.10958  26.013 <0.0000000000000002 ***
## X1           0.00015    0.00111   0.135               0.893    
## X3          -0.25959    3.05250  -0.085               0.932    
## X5           0.79799    1.12271   0.711               0.478    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.374 on 491 degrees of freedom
## Multiple R-squared:  0.001126,   Adjusted R-squared:  -0.004977 
## F-statistic: 0.1845 on 3 and 491 DF,  p-value: 0.9069
print(coef(multivariateModel )[1])
## (Intercept) 
##    2.850405
print(coef(multivariateModel)[2])
##           X1 
## 0.0001500324
print(coef(multivariateModel)[3])
##         X3 
## -0.2595929
#Standard Error squared
(summary(multivariateModel)$sigma)^2
## [1] 1.889232
p_value <- anova(multivariateModel)$'Pr(>F)'[1]
p_value
## [1] 0.8559622
rSquared <- summary(multivariateModel)$r.squared
rSquared
## [1] 0.001126119
f_value <- summary(multivariateModel)[10]
f_value
## $fstatistic
##      value      numdf      dendf 
##   0.184516   3.000000 491.000000
adj.r.squared <- summary(multivariateModel)[9]
adj.r.squared
## $adj.r.squared
## [1] -0.00497698
par(mfrow=c(2,2))
plot(multivariateModel)

par(mfrow=c(1,1))
#plot(multivariateModel, which = 2, main = "Fig.8: Q~Q Plot")
mmf.stdres = rstandard(multivariateModel)
qqnorm(mmf.stdres, 
     ylab="Standardized Residuals", 
     xlab="Normal Scores", 
    main="Q~Q Plot") 
 qqline(mmf.stdres)

plot(multivariateModel$residuals, main = "Residuals")

Observation on Multivariate regression: X2 ~ X1 + X3 + X5

The intercept and coefficients are shown above. For this model (X2 regressed on X1, X3, and X5) the coefficient of determination R² is only 0.00113, so these three variables explain almost none of the variation in X2. The correlation matrix, however, already showed that X2 and X4 are almost perfectly correlated (r ≈ 0.979), so there is multicollinearity between X2 and X4; with VIF = 1/(1 − R²), the VIF for that pair would be very large. This is why X4 is not used together with X2 in the models that follow.

The Q-Q plot above shows that the residuals are not normally distributed: the left and right tails deviate from the 45-degree reference line, so the distribution of the residuals is not identical to a standard normal distribution.

From the above plot, we can see that the residuals are randomly scattered. They are neither positively nor negatively correlated, and they appear to have a constant variance.

multivariateModel <- lm(X3 ~ X1+ X2 + X5 , data= myData)
summary(multivariateModel)
## 
## Call:
## lm(formula = X3 ~ X1 + X2 + X5, data = myData)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.018591 -0.013580 -0.006149  0.006425  0.133217 
## 
## Coefficients:
##                 Estimate   Std. Error t value           Pr(>|t|)    
## (Intercept)  0.018618470  0.002352726   7.914 0.0000000000000167 ***
## X1           0.000001435  0.000016414   0.087              0.930    
## X2          -0.000056741  0.000667201  -0.085              0.932    
## X5          -0.020001668  0.016582427  -1.206              0.228    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02032 on 491 degrees of freedom
## Multiple R-squared:  0.002986,   Adjusted R-squared:  -0.003106 
## F-statistic: 0.4901 on 3 and 491 DF,  p-value: 0.6893
print(coef(multivariateModel)[1])
## (Intercept) 
##  0.01861847
print(coef(multivariateModel)[2])
##            X1 
## 0.00000143496
print(coef(multivariateModel)[3])
##             X2 
## -0.00005674068
#Standard Error
summary(multivariateModel)$sigma
## [1] 0.02032093
p_value <- anova(multivariateModel)$'Pr(>F)'[1]
p_value
## [1] 0.9931757
rSquared <- summary(multivariateModel)$r.squared
rSquared
## [1] 0.002985726
f_value <- summary(multivariateModel)[10]
f_value
## $fstatistic
##       value       numdf       dendf 
##   0.4901271   3.0000000 491.0000000
adj.r.squared <- summary(multivariateModel)[9]
adj.r.squared
## $adj.r.squared
## [1] -0.003106011
par(mfrow=c(2,2))
plot(multivariateModel)

par(mfrow=c(1,1))
#plot(multivariateModel, which = 2, main = "Fig.8: Q~Q Plot")
mmf.stdres = rstandard(multivariateModel)
qqnorm(mmf.stdres, 
     ylab="Standardized Residuals", 
     xlab="Normal Scores", 
    main=" Q~Q Plot") 
 qqline(mmf.stdres)

plot(multivariateModel$residuals, main = "Residuals")

Observation on Multivariate regression: X3 ~ X1 + X2 + X5

The intercept and coefficients are shown above. The coefficient of determination R² is 0.002986. Since R² is close to 0, this model is a poor fit, which means there is no multicollinearity between X3 and the other independent variables. With VIF = 1/(1 − R²), the VIF is also close to 1.

The Q-Q plot above shows that the residuals are not normally distributed: the left and right tails deviate from the 45-degree reference line, so the distribution of the residuals is not identical to a standard normal distribution.

From the above plot, we can see that the residuals are randomly scattered. They are neither positively nor negatively correlated, and they appear to have a constant variance.

multivariateModel <- lm(X5 ~ X1 + X2 + X3, data= myData)
summary(multivariateModel)
## 
## Call:
## lm(formula = X5 ~ X1 + X2 + X3, data = myData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.02072 -0.01217 -0.00851 -0.00470  0.87361 
## 
## Coefficients:
##                Estimate  Std. Error t value Pr(>|t|)
## (Intercept)  0.00426134  0.00678629   0.628    0.530
## X1           0.00006393  0.00004451   1.436    0.152
## X2           0.00128806  0.00181220   0.711    0.478
## X3          -0.14770784  0.12245751  -1.206    0.228
## 
## Residual standard error: 0.05522 on 491 degrees of freedom
## Multiple R-squared:  0.008177,   Adjusted R-squared:  0.002117 
## F-statistic: 1.349 on 3 and 491 DF,  p-value: 0.2577
print(coef(multivariateModel )[1])
## (Intercept) 
## 0.004261344
print(coef(multivariateModel)[2])
##            X1 
## 0.00006392538
print(coef(multivariateModel)[3])
##          X2 
## 0.001288059
print(coef(multivariateModel)[4])
##         X3 
## -0.1477078
#Standard Error squared
(summary(multivariateModel)$sigma)^2
## [1] 0.00304947
p_value <- anova(multivariateModel)$'Pr(>F)'[1]
p_value
## [1] 0.1500556
rSquared <- summary(multivariateModel)$r.squared
rSquared
## [1] 0.00817686
f_value <- summary(multivariateModel)[10]
f_value
## $fstatistic
##      value      numdf      dendf 
##   1.349313   3.000000 491.000000
adj.r.squared <- summary(multivariateModel)[9]
adj.r.squared
## $adj.r.squared
## [1] 0.002116841
par(mfrow=c(2,2))
plot(multivariateModel)

par(mfrow=c(1,1))
#plot(multivariateModel, which = 2, main = "Fig.8: Q~Q Plot")
mmf.stdres = rstandard(multivariateModel)
qqnorm(mmf.stdres, 
     ylab="Standardized Residuals", 
     xlab="Normal Scores", 
    main=" Q~Q Plot") 
 qqline(mmf.stdres)

plot(multivariateModel$residuals, main = "Residuals")

Observation on Multivariate regression: X5 ~ X1 + X2 + X3

The intercept and coefficients are shown above. The coefficient of determination R² is 0.008177. Since R² is close to 0, this model is a poor fit, which means there is no multicollinearity between X5 and the other independent variables. With VIF = 1/(1 − R²), the VIF is also close to 1.

The Q-Q plot above shows that the residuals are not normally distributed: the left and right tails deviate from the 45-degree reference line, so the distribution of the residuals is not identical to a standard normal distribution.

From the above plot, we can see that the residuals are randomly scattered and neither positively nor negatively correlated. It can also be observed that the variance might be slightly decreasing.

Adjusted R Square analysis

Y ~ X1

multivariateModel1 <- lm(Y ~ X1, data= myData)
summary(multivariateModel1)
## 
## Call:
## lm(formula = Y ~ X1, data = myData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -280.6 -179.8  -30.9  161.6  433.6 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 284.6896    15.1653   18.77 <0.0000000000000002 ***
## X1            8.4741     0.1791   47.31 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 222.2 on 493 degrees of freedom
## Multiple R-squared:  0.8195, Adjusted R-squared:  0.8191 
## F-statistic:  2238 on 1 and 493 DF,  p-value: < 0.00000000000000022
print(coef(multivariateModel1)[1])
## (Intercept) 
##    284.6896
print(coef(multivariateModel1)[2])
##       X1 
## 8.474147
#Adjusted R Squared

adj.r.squared <- summary(multivariateModel1)[9]
adj.r.squared
## $adj.r.squared
## [1] 0.8191429

Y ~ X1 + X2

multivariateModel1 <- lm(Y ~ X1 + X2, data= myData)
summary(multivariateModel1)
## 
## Call:
## lm(formula = Y ~ X1 + X2, data = myData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -81.22 -35.17 -17.91  48.62  93.08 
## 
## Coefficients:
##              Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) -167.1892     5.2146  -32.06 <0.0000000000000002 ***
## X1             8.4422     0.0363  232.54 <0.0000000000000002 ***
## X2           158.5647     1.4781  107.28 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45.04 on 492 degrees of freedom
## Multiple R-squared:  0.9926, Adjusted R-squared:  0.9926 
## F-statistic: 3.3e+04 on 2 and 492 DF,  p-value: < 0.00000000000000022
print(coef(multivariateModel1)[1])
## (Intercept) 
##   -167.1892
print(coef(multivariateModel1)[2])
##       X1 
## 8.442244
#Adjusted R Squared

adj.r.squared <- summary(multivariateModel1)[9]
adj.r.squared
## $adj.r.squared
## [1] 0.9925699

Y ~ X1 + X2 + X3

multivariateModel1 <- lm(Y ~ X1 + X2 + X3, data= myData)
summary(multivariateModel1)
## 
## Call:
## lm(formula = Y ~ X1 + X2 + X3, data = myData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -80.85 -35.51 -18.62  48.59  91.64 
## 
## Coefficients:
##               Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) -167.92888    5.53997 -30.312 <0.0000000000000002 ***
## X1             8.44224    0.03634 232.338 <0.0000000000000002 ***
## X2           158.56797    1.47938 107.185 <0.0000000000000002 ***
## X3            39.79333   99.96786   0.398               0.691    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45.08 on 491 degrees of freedom
## Multiple R-squared:  0.9926, Adjusted R-squared:  0.9926 
## F-statistic: 2.196e+04 on 3 and 491 DF,  p-value: < 0.00000000000000022
print(coef(multivariateModel1)[1])
## (Intercept) 
##   -167.9289
print(coef(multivariateModel1)[2])
##       X1 
## 8.442238
#Adjusted R Squared

adj.r.squared <- summary(multivariateModel1)[9]
adj.r.squared
## $adj.r.squared
## [1] 0.9925572

Y ~ X1 + X2 + X3 + X5

multivariateModel1 <- lm(Y ~ X1 + X2 + X3 + X5, data= myData)
summary(multivariateModel1)
## 
## Call:
## lm(formula = Y ~ X1 + X2 + X3 + X5, data = myData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -101.03  -34.66  -18.36   47.65   90.06 
## 
## Coefficients:
##               Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) -168.35344    5.50638 -30.574 < 0.0000000000000002 ***
## X1             8.43587    0.03618 233.183 < 0.0000000000000002 ***
## X2           158.43963    1.47058 107.740 < 0.0000000000000002 ***
## X3            54.50974   99.46878   0.548              0.58394    
## X5            99.63186   36.60312   2.722              0.00672 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.79 on 490 degrees of freedom
## Multiple R-squared:  0.9927, Adjusted R-squared:  0.9927 
## F-statistic: 1.669e+04 on 4 and 490 DF,  p-value: < 0.00000000000000022
print(coef(multivariateModel1)[1])
## (Intercept) 
##   -168.3534
print(coef(multivariateModel1)[2])
##       X1 
## 8.435869
#Adjusted R Squared

adj.r.squared <- summary(multivariateModel1)[9]
adj.r.squared
## $adj.r.squared
## [1] 0.9926531

Summary of the above Adjusted-R squared analysis:

ars_analysis <- c("Y ~ X1", "Y ~ X1 + X2","Y ~ X1 + X2 + X3", "Y ~ X1 + X2 + X3 + X5")

adj.r.squared <- c(0.8191429, 0.9925699, 0.9925572, 0.9926531)

data.frame(ars_analysis, adj.r.squared)
##            ars_analysis adj.r.squared
## 1                Y ~ X1     0.8191429
## 2           Y ~ X1 + X2     0.9925699
## 3      Y ~ X1 + X2 + X3     0.9925572
## 4 Y ~ X1 + X2 + X3 + X5     0.9926531

The variables used for analysis

From the analysis above, the adjusted R² is highest when all four variables are used, although adding X3 actually reduces the adjusted R² slightly. Because the gains from X3 and X5 are marginal, we will use the first two variables, X1 and X2, for our prediction; these are also the variables with the strongest correlation with Y.

Now we use the two variables X1 and X2 to predict Yhat and decide whether to accept or reject H0 (H naught). H0 states that the predicted values Yhat are equal to the original Y values.

multivariateModel_final <- lm(Y ~ X1 + X2, data= myData)
summary(multivariateModel_final)
## 
## Call:
## lm(formula = Y ~ X1 + X2, data = myData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -81.22 -35.17 -17.91  48.62  93.08 
## 
## Coefficients:
##              Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) -167.1892     5.2146  -32.06 <0.0000000000000002 ***
## X1             8.4422     0.0363  232.54 <0.0000000000000002 ***
## X2           158.5647     1.4781  107.28 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45.04 on 492 degrees of freedom
## Multiple R-squared:  0.9926, Adjusted R-squared:  0.9926 
## F-statistic: 3.3e+04 on 2 and 492 DF,  p-value: < 0.00000000000000022
print(coef(multivariateModel_final)[1])
## (Intercept) 
##   -167.1892
print(coef(multivariateModel_final)[2])
##       X1 
## 8.442244
print(coef(multivariateModel_final)[3])
##       X2 
## 158.5647
#Standard Error squared
(summary(multivariateModel_final)$sigma)^2
## [1] 2028.76
p_value <- anova(multivariateModel_final)$'Pr(>F)'[1]
p_value
## [1] 0
rSquared <- summary(multivariateModel_final)$r.squared
rSquared
## [1] 0.9926
f_value <- summary(multivariateModel_final)[10]
f_value
## $fstatistic
##    value    numdf    dendf 
## 32997.34     2.00   492.00
adj.r.squared <- summary(multivariateModel_final)[9]
adj.r.squared
## $adj.r.squared
## [1] 0.9925699
par(mfrow=c(2,2))
plot(multivariateModel_final)

par(mfrow=c(1,1))
#plot(multivariateModel, which = 2, main = "Q~Q Plot")
mmf.stdres = rstandard(multivariateModel_final)
qqnorm(mmf.stdres, 
     ylab="Standardized Residuals", 
     xlab="Normal Scores", 
    main=" Q~Q Plot") 
 qqline(mmf.stdres)

plot(multivariateModel_final$residuals, main = "Residuals")
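As a final check against H0 (a sketch, not part of the original output), the fitted values Yhat can be placed next to the observed Y:

# Compare observed Y with predictions from the final X1 + X2 model
Yhat <- predict(multivariateModel_final, newdata = myData)
head(data.frame(Y = myData$Y, Yhat = Yhat, residual = myData$Y - Yhat))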

Conclusion

The intercept and coefficients are shown above. The coefficient of determination R² is 0.9926. Since R² is close to 1, we have a good fit.

The Q-Q plot above shows that the residuals are not normally distributed: the left and right tails deviate from the 45-degree reference line, so the distribution of the residuals is not identical to a standard normal distribution.

From the above plot, we can see that the residuals are randomly scattered. They are neither positively nor negatively correlated, and the variance appears constant.

Since we have a good fit, we fail to reject H0. Our failure to reject H0 is based on the high R² value and the fact that the variables used give a high adjusted R² value.