myData <- read.csv("/Users/ruthokoilu/Documents/Fall 17/ CSC591/Project2/OKOILU RUTH.csv")
head(myData)
## X1 X2 X3 X4 X5 Y
## 1 -1.01261 3 0.001361302 504 1.877810e-55 239.0384
## 2 155.18308 5 0.019201820 1400 1.589644e-08 1980.3923
## 3 90.64515 4 0.010076991 896 5.526546e-04 1193.7046
## 4 74.27032 3 0.005263104 504 2.403234e-25 886.8339
## 5 39.44549 3 0.003669509 504 4.761983e-05 605.3050
## 6 51.14948 2 0.001775451 224 6.342782e-12 532.1127
summary(myData)
## X1 X2 X3 X4
## Min. :-145.04 Min. :1.000 Min. :1.887e-05 Min. : 56.0
## 1st Qu.: 27.53 1st Qu.:2.000 1st Qu.:4.651e-03 1st Qu.: 224.0
## Median : 65.41 Median :3.000 Median :1.208e-02 Median : 504.0
## Mean : 63.29 Mean :2.852 Mean :1.827e-02 Mean : 560.9
## 3rd Qu.: 99.82 3rd Qu.:4.000 3rd Qu.:2.474e-02 3rd Qu.: 896.0
## Max. : 229.08 Max. :5.000 Max. :1.518e-01 Max. :1400.0
## X5 Y
## Min. :0.0000000 Min. :-1188.6
## 1st Qu.:0.0000000 1st Qu.: 465.5
## Median :0.0000000 Median : 809.3
## Mean :0.0092162 Mean : 819.6
## 3rd Qu.:0.0000204 3rd Qu.: 1182.9
## Max. :0.8897839 Max. : 2179.2
library(ggplot2)
par(mfrow=c(1,1))
#qplot(myData[, 1], geom="histogram")
qplot(myData[, 1],
      geom = "histogram",
      binwidth = 5,
      main = "Histogram for Column X1",
      xlab = "X1",
      fill = I("blue"),
      col = I("red"),
      alpha = I(.2),
      xlim = c(20, 50))
## Warning: Removed 422 rows containing non-finite values (stat_bin).
qplot(myData[, 2],
      geom = "histogram",
      binwidth = 1,
      main = "Histogram for Column X2",
      xlab = "X2",
      fill = I("blue"),
      col = I("red"),
      alpha = I(.2))
qplot(myData[, 3],
      geom = "histogram",
      binwidth = 0.005,
      main = "Histogram for Column X3",
      xlab = "X3",
      fill = I("blue"),
      col = I("red"),
      alpha = I(.2))
qplot(myData[, 4],
      geom = "histogram",
      binwidth = 30,
      main = "Histogram for Column X4",
      xlab = "X4",
      fill = I("blue"),
      col = I("red"),
      alpha = I(.2))
qplot(myData[, 5],
      geom = "histogram",
      binwidth = 0.01,
      main = "Histogram for Column X5",
      xlab = "X5",
      fill = I("blue"),
      col = I("red"),
      alpha = I(.2))
qplot(myData[, 6],
      geom = "histogram",
      binwidth = 30,
      main = "Histogram for Column Y",
      xlab = "Y",
      fill = I("blue"),
      col = I("red"),
      alpha = I(.2))
The histogram for column Y suggests that Y is approximately normally distributed.
#as.data.frame(table(myData[, 1]))
as.data.frame(table(myData[, 2])) # only 5 distinct values: could be treated as a factor
## Var1 Freq
## 1 1 116
## 2 2 92
## 3 3 115
## 4 4 104
## 5 5 73
#as.data.frame(table(myData[, 3]))
as.data.frame(table(myData[, 4])) # only 5 distinct values: could be treated as a factor
## Var1 Freq
## 1 56 116
## 2 224 92
## 3 504 115
## 4 896 104
## 5 1400 73
#as.data.frame(table(myData[, 5]))
#as.data.frame(table(myData[, 6]))
options(scipen=9999)
Columns <- c("X1", "X2", "X3", "X4", "X5","Y")
mean_1 <- mean(myData[, 1])
mean_2 <- mean(myData[, 2])
mean_3 <- mean(myData[, 3])
mean_4 <- mean(myData[, 4])
mean_5 <- mean(myData[, 5])
mean_6 <- mean(myData[, 6])
Mean <- c(mean_1, mean_2, mean_3, mean_4, mean_5, mean_6)
all_means <- data.frame(Columns, Mean)
all_means
## Columns Mean
## 1 X1 63.289457093
## 2 X2 2.852000000
## 3 X3 0.018268310
## 4 X4 560.896000000
## 5 X5 0.009216197
## 6 Y 819.612549513
Columns <- c("X1", "X2", "X3", "X4", "X5","Y")
var_1 <- var(myData[, 1])
var_2 <- var(myData[, 2])
var_3 <- var(myData[, 3])
var_4 <- var(myData[, 4])
var_5 <- var(myData[, 5])
var_6 <- var(myData[, 6])
Variance <- c(var_1, var_2, var_3, var_4, var_5, var_6)
all_vars <- data.frame(Columns, Variance)
all_vars
## Columns Variance
## 1 X1 3364.7627563464
## 2 X2 1.8858677355
## 3 X3 0.0004089895
## 4 X4 207339.7005851703
## 5 X5 0.0030261775
## 6 Y 293780.4107939845
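The same summary tables can be built more compactly; a minimal sketch using sapply over the columns (this assumes myData contains only numeric columns, as it does here):
# Compact alternative: compute all column means and variances in one step
data.frame(Columns = names(myData),
           Mean = sapply(myData, mean),
           Variance = sapply(myData, var))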
boxplot(myData[, 1], main = "Boxplot for Column X1")
boxplot(myData[, 2], main = "Boxplot for Column X2")
boxplot(myData[, 3], main = "Boxplot for Column X3")
boxplot(myData[, 4], main = "Boxplot for Column X4")
boxplot(myData[, 5], main = "Boxplot for Column X5")
boxplot(myData[, 6], main = "Boxplot for Column Y")
X2 has no outliers, while X4 has a few.
removeOutliers <- function(newData){
  # Boxplot statistics of the first column; stats[1] and stats[5] are the
  # lower and upper whiskers (the 1.5 * IQR fences)
  interQValues <- boxplot.stats(newData[, 1])
  lowerWhisker <- interQValues$stats[1]
  upperWhisker <- interQValues$stats[5]
  # Keep only the rows whose first column lies strictly inside the whiskers
  newData2 <- subset(newData, newData[, 1] > lowerWhisker & newData[, 1] < upperWhisker)
  return(newData2)
}
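The function above always filters on the first column. A minimal sketch of a column-parameterized variant (removeOutliersCol is a hypothetical name, not used in the analysis below):
# Hypothetical generalization: remove whisker outliers in any chosen column
removeOutliersCol <- function(df, col) {
  s <- boxplot.stats(df[[col]])$stats   # s[1] and s[5] are the whiskers
  df[df[[col]] > s[1] & df[[col]] < s[5], ]
}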
par(mfrow=c(1,2))
boxplot(myData[, 1], main="Before removing outliers")
#new_X1 <- removeOutliers(myData[,1])
new_X1 <- removeOutliers(myData)
boxplot(new_X1[,1], main="After removing outliers")
myData <- new_X1
cor(myData)
## X1 X2 X3 X4 X5
## X1 1.0000000000 0.008191419 0.0003856156 0.020274771 0.06479151
## X2 0.0081914191 1.000000000 -0.0055883036 0.978999078 0.03277857
## X3 0.0003856156 -0.005588304 1.0000000000 -0.003600965 -0.05436637
## X4 0.0202747709 0.978999078 -0.0036009650 1.000000000 0.05516787
## X5 0.0647915069 0.032778570 -0.0543663701 0.055167871 1.00000000
## Y 0.9052673858 0.423443509 -0.0004322085 0.442592994 0.08244086
## Y
## X1 0.9052673858
## X2 0.4234435093
## X3 -0.0004322085
## X4 0.4425929943
## X5 0.0824408552
## Y 1.0000000000
linearModel <- lm(Y ~ X1, data= myData)
summary(linearModel)
##
## Call:
## lm(formula = Y ~ X1, data = myData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -280.6 -179.8 -30.9 161.6 433.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 284.6896 15.1653 18.77 <0.0000000000000002 ***
## X1 8.4741 0.1791 47.31 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 222.2 on 493 degrees of freedom
## Multiple R-squared: 0.8195, Adjusted R-squared: 0.8191
## F-statistic: 2238 on 1 and 493 DF, p-value: < 0.00000000000000022
From the summary of the linear model above, we can see the details of the model created. The summary shows the statistics (min, max, median, and interquartile values) of the residuals of the analysis. The residual is simply the error: the actual value of Y minus the predicted value Yhat.
\[\hat{\varepsilon}_i = Y_i - \hat{Y}_i\]
2.1 To determine the values for a0, a1, and s2.
The summary also shows the parameters of the model: Y = a0 + a1X + ε.
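As a quick sanity check (a sketch), the residuals reported in the summary are exactly Y minus the fitted values:
# The residuals equal the observed Y minus the fitted values
resid_direct <- myData$Y - fitted(linearModel)
all.equal(as.numeric(residuals(linearModel)), as.numeric(resid_direct))
From the coefficients printed below, it can be seen that: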
# What are the coefficients a0 and a1?
# To print out the coefficients
print(coef(linearModel)[1])
## (Intercept)
## 284.6896
print(coef(linearModel)[2])
## X1
## 8.474147
a0 is 284.6896 and a1 is 8.474147.
The residual standard error is 222.2219, so σ2 (its square) is 49382.59:
sigmaSd <- summary(linearModel)$sigma
sigmaSd^2
## [1] 49382.59
The residual variance s2 is shown above.
The p-value is:
p_value <- anova(linearModel)$'Pr(>F)'[1]
p_value
## [1] 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002063461
The coefficient of determination R2 is 0.819509. R2 ranges from 0 to 1: a value near 0 means a bad fit, while a value near 1 means a good fit. Here R2 is close to 1, so we have a good fit.
rSquared <- summary(linearModel)$r.squared
rSquared
## [1] 0.819509
The F value tells us whether to reject H0: if F is greater than the critical value, we reject H0; otherwise we fail to reject it. The F statistic is 2238.439 on 1 and 493 degrees of freedom:
f_value <- summary(linearModel)[10]
f_value
## $fstatistic
## value numdf dendf
## 2238.439 1.000 493.000
The adjusted R squared is used in feature selection to tell us which independent variables improve our model: its value increases only when important variables are added.
adj.r.squared <- summary(linearModel)[9]
adj.r.squared
## $adj.r.squared
## [1] 0.8191429
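The diagnostic plots discussed in the next three comments are not echoed in this output. A minimal sketch that reproduces them, following the same plotting pattern used for the multivariate models later in this report:
# Fitted line over the data, then the usual residual diagnostics
plot(myData$X1, myData$Y, main = "Y vs X1 with fitted line")
abline(linearModel, col = "red")
lm.stdres <- rstandard(linearModel)
qqnorm(lm.stdres,
       ylab = "Standardized Residuals",
       xlab = "Normal Scores",
       main = "Q~Q Plot")
qqline(lm.stdres)
plot(linearModel$residuals, main = "Residuals")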
Looking at the scatter plot with the fitted line overlaid, the regression line looks like a good fit.
The Q-Q plot shows that the residuals are not really normally distributed: the left and right tails deviate from the 45-degree straight line, so the distribution of the residuals is not identical to that of a standard normal distribution.
The residual plot shows that the residuals are random, neither positively nor negatively correlated, and of roughly constant variance.
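The next summary comes from a quadratic model whose code chunk is not echoed. A sketch of the fit, assuming the squared term is stored in a column named X.squared (matching the Call below):
# Add the squared term as a column, then refit the model with it
myData$X.squared <- myData$X1^2
polyModel <- lm(Y ~ X1 + X.squared, data = myData)
summary(polyModel)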
##
## Call:
## lm(formula = Y ~ X1 + X.squared, data = myData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -282.18 -179.30 -31.03 161.58 433.74
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 284.798920 15.260126 18.66 <0.0000000000000002 ***
## X1 8.453078 0.349348 24.20 <0.0000000000000002 ***
## X.squared 0.000172 0.002448 0.07 0.944
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 222.4 on 492 degrees of freedom
## Multiple R-squared: 0.8195, Adjusted R-squared: 0.8188
## F-statistic: 1117 on 2 and 492 DF, p-value: < 0.00000000000000022
## (Intercept)
## 284.7989
## X1
## 8.453078
## [1] 222.4466
## [1] 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000004852096
## [1] 0.8195109
## $fstatistic
## value numdf dendf
## 1116.963 2.000 492.000
## $adj.r.squared
## [1] 0.8187772
## [1] 0.8195109
a0 becomes 284.7989 and a1 is 8.453078; the p-value roughly doubled and the F value decreased by about half. R squared is 0.8195109. Compared with the R squared of the previous analysis, 0.819509, this shows that adding a polynomial term to our analysis does not improve the fit.
multivariateModel <- lm(X1 ~ X2 + X3 + X5 , data= myData)
summary(multivariateModel)
##
## Call:
## lm(formula = X1 ~ X2 + X3 + X5, data = myData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -139.602 -35.789 1.345 35.918 130.600
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 62.1940 6.2694 9.920 <0.0000000000000002 ***
## X2 0.2479 1.8345 0.135 0.893
## X3 10.8480 124.0824 0.087 0.930
## X5 65.4402 45.5653 1.436 0.152
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 55.87 on 491 degrees of freedom
## Multiple R-squared: 0.00425, Adjusted R-squared: -0.001834
## F-statistic: 0.6986 on 3 and 491 DF, p-value: 0.5532
# Coefficients are:
print(coef(multivariateModel )[1])
## (Intercept)
## 62.19397
print(coef(multivariateModel)[2])
## X2
## 0.2479108
print(coef(multivariateModel)[3])
## X3
## 10.84797
print(coef(multivariateModel)[4])
## X5
## 65.44023
# σ2 is
multSigmasd <- summary(multivariateModel)$sigma
#Standard Error
summary(multivariateModel)$sigma
## [1] 55.87247
# p-value of the first predictor in the sequential ANOVA table (the overall
# model p-value, 0.5532, appears in the summary above)
p_value <- anova(multivariateModel)$'Pr(>F)'[1]
p_value
## [1] 0.8557389
rSquared <- summary(multivariateModel)$r.squared
rSquared
## [1] 0.004250296
f_value <- summary(multivariateModel)[10]
f_value
## $fstatistic
## value numdf dendf
## 0.698601 3.000000 491.000000
adj.r.squared <- summary(multivariateModel)[9]
adj.r.squared
## $adj.r.squared
## [1] -0.001833715
par(mfrow=c(2,2))
plot(multivariateModel)
par(mfrow=c(1,1))
#plot(multivariateModel, which = 2, main = "Q~Q Plot")
mmf.stdres <- rstandard(multivariateModel)
qqnorm(mmf.stdres,
       ylab = "Standardized Residuals",
       xlab = "Normal Scores",
       main = "Q~Q Plot")
qqline(mmf.stdres)
plot(multivariateModel$residuals, main = "Residuals")
The intercept and coefficients are shown above. The coefficient of determination R2 is 0.004250296, which is close to 0. This means we have a bad fit and there is no multicollinearity between X1 and any other independent variable. Since VIF = 1/(1 - R2), the VIF for X1 will also be close to 1.
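The VIF mentioned above can be computed directly from this R2; a minimal sketch:
# VIF for X1, from the R-squared of regressing X1 on the other predictors
r2_x1 <- summary(multivariateModel)$r.squared
1 / (1 - r2_x1)   # about 1.004, i.e. no multicollinearity involving X1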
The Q-Q plot above shows that the residuals are normally distributed: the left and right tails follow the 45-degree straight line, so the PDF of the residuals is close to that of a standard normal distribution.
From the residual plot, we can see that the residuals are randomly distributed, neither positively nor negatively correlated, and of constant variance.
multivariateModel <- lm(X2 ~ X1 + X3 + X5 , data= myData)
summary(multivariateModel)
##
## Call:
## lm(formula = X2 ~ X1 + X3 + X5, data = myData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1830 -0.8670 0.1399 1.1414 2.1747
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.85041 0.10958 26.013 <0.0000000000000002 ***
## X1 0.00015 0.00111 0.135 0.893
## X3 -0.25959 3.05250 -0.085 0.932
## X5 0.79799 1.12271 0.711 0.478
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.374 on 491 degrees of freedom
## Multiple R-squared: 0.001126, Adjusted R-squared: -0.004977
## F-statistic: 0.1845 on 3 and 491 DF, p-value: 0.9069
print(coef(multivariateModel )[1])
## (Intercept)
## 2.850405
print(coef(multivariateModel)[2])
## X1
## 0.0001500324
print(coef(multivariateModel)[3])
## X3
## -0.2595929
#Standard Error squared
(summary(multivariateModel)$sigma)^2
## [1] 1.889232
p_value <- anova(multivariateModel)$'Pr(>F)'[1]
p_value
## [1] 0.8559622
rSquared <- summary(multivariateModel)$r.squared
rSquared
## [1] 0.001126119
f_value <- summary(multivariateModel)[10]
f_value
## $fstatistic
## value numdf dendf
## 0.184516 3.000000 491.000000
adj.r.squared <- summary(multivariateModel)[9]
adj.r.squared
## $adj.r.squared
## [1] -0.00497698
par(mfrow=c(2,2))
plot(multivariateModel)
par(mfrow=c(1,1))
#plot(multivariateModel, which = 2, main = "Fig.8: Q~Q Plot")
mmf.stdres <- rstandard(multivariateModel)
qqnorm(mmf.stdres,
       ylab = "Standardized Residuals",
       xlab = "Normal Scores",
       main = "Q~Q Plot")
qqline(mmf.stdres)
plot(multivariateModel$residuals, main = "Residuals")
The intercept and coefficients are shown above. For this regression, which excludes X4, the coefficient of determination R2 is only 0.001126. The correlation matrix, however, shows that X2 and X4 are almost perfectly correlated (r = 0.979), so a regression of X2 that included X4 would have an R2 close to 1, meaning there is multicollinearity between X2 and X4. Since VIF = 1/(1 - R2), the corresponding VIF would be very large.
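This can be checked directly from the X2-X4 relationship; a minimal sketch:
# X2 and X4 are almost perfectly correlated, so the VIF is large
r2_x2 <- summary(lm(X2 ~ X4, data = myData))$r.squared
1 / (1 - r2_x2)   # roughly 24: strong multicollinearity between X2 and X4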
The Q-Q plot above shows that the residuals are not normally distributed: the left and right tails deviate from the 45-degree straight line, so the PDF of the residuals is not identical to that of a standard normal distribution.
From the residual plot, we can see that the residuals are randomly distributed, neither positively nor negatively correlated, and of constant variance.
multivariateModel <- lm(X3 ~ X1+ X2 + X5 , data= myData)
summary(multivariateModel)
##
## Call:
## lm(formula = X3 ~ X1 + X2 + X5, data = myData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.018591 -0.013580 -0.006149 0.006425 0.133217
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.018618470 0.002352726 7.914 0.0000000000000167 ***
## X1 0.000001435 0.000016414 0.087 0.930
## X2 -0.000056741 0.000667201 -0.085 0.932
## X5 -0.020001668 0.016582427 -1.206 0.228
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02032 on 491 degrees of freedom
## Multiple R-squared: 0.002986, Adjusted R-squared: -0.003106
## F-statistic: 0.4901 on 3 and 491 DF, p-value: 0.6893
print(coef(multivariateModel)[1])
## (Intercept)
## 0.01861847
print(coef(multivariateModel)[2])
## X1
## 0.00000143496
print(coef(multivariateModel)[3])
## X2
## -0.00005674068
#Standard Error
summary(multivariateModel)$sigma
## [1] 0.02032093
p_value <- anova(multivariateModel)$'Pr(>F)'[1]
p_value
## [1] 0.9931757
rSquared <- summary(multivariateModel)$r.squared
rSquared
## [1] 0.002985726
f_value <- summary(multivariateModel)[10]
f_value
## $fstatistic
## value numdf dendf
## 0.4901271 3.0000000 491.0000000
adj.r.squared <- summary(multivariateModel)[9]
adj.r.squared
## $adj.r.squared
## [1] -0.003106011
par(mfrow=c(2,2))
plot(multivariateModel)
par(mfrow=c(1,1))
#plot(multivariateModel, which = 2, main = "Fig.8: Q~Q Plot")
mmf.stdres <- rstandard(multivariateModel)
qqnorm(mmf.stdres,
       ylab = "Standardized Residuals",
       xlab = "Normal Scores",
       main = "Q~Q Plot")
qqline(mmf.stdres)
plot(multivariateModel$residuals, main = "Residuals")
The intercept and coefficients are shown above. The coefficient of determination R2 is 0.002985726, which is close to 0. This means we have a bad fit and there is no multicollinearity between X3 and any other independent variable. Since VIF = 1/(1 - R2), the VIF for X3 will also be close to 1.
The Q-Q plot above shows that the residuals are not normally distributed: the left and right tails deviate from the 45-degree straight line, so the PDF of the residuals is not identical to that of a standard normal distribution.
From the residual plot, we can see that the residuals are randomly distributed, neither positively nor negatively correlated, and of constant variance.
multivariateModel <- lm(X5 ~ X1 + X2 + X3, data= myData)
summary(multivariateModel)
##
## Call:
## lm(formula = X5 ~ X1 + X2 + X3, data = myData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.02072 -0.01217 -0.00851 -0.00470 0.87361
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.00426134 0.00678629 0.628 0.530
## X1 0.00006393 0.00004451 1.436 0.152
## X2 0.00128806 0.00181220 0.711 0.478
## X3 -0.14770784 0.12245751 -1.206 0.228
##
## Residual standard error: 0.05522 on 491 degrees of freedom
## Multiple R-squared: 0.008177, Adjusted R-squared: 0.002117
## F-statistic: 1.349 on 3 and 491 DF, p-value: 0.2577
print(coef(multivariateModel )[1])
## (Intercept)
## 0.004261344
print(coef(multivariateModel)[2])
## X1
## 0.00006392538
print(coef(multivariateModel)[3])
## X2
## 0.001288059
print(coef(multivariateModel)[4])
## X3
## -0.1477078
#Standard Error squared
(summary(multivariateModel)$sigma)^2
## [1] 0.00304947
p_value <- anova(multivariateModel)$'Pr(>F)'[1]
p_value
## [1] 0.1500556
rSquared <- summary(multivariateModel)$r.squared
rSquared
## [1] 0.00817686
f_value <- summary(multivariateModel)[10]
f_value
## $fstatistic
## value numdf dendf
## 1.349313 3.000000 491.000000
adj.r.squared <- summary(multivariateModel)[9]
adj.r.squared
## $adj.r.squared
## [1] 0.002116841
par(mfrow=c(2,2))
plot(multivariateModel)
par(mfrow=c(1,1))
#plot(multivariateModel, which = 2, main = "Fig.8: Q~Q Plot")
mmf.stdres <- rstandard(multivariateModel)
qqnorm(mmf.stdres,
       ylab = "Standardized Residuals",
       xlab = "Normal Scores",
       main = "Q~Q Plot")
qqline(mmf.stdres)
plot(multivariateModel$residuals, main = "Residuals")
The intercept and coefficients are shown above. The coefficient of determination R2 is 0.00817686, which is close to 0. This means we have a bad fit and there is no multicollinearity between X5 and any other independent variable. Since VIF = 1/(1 - R2), the VIF for X5 will also be close to 1.
The Q-Q plot above shows that the residuals are not normally distributed: the left and right tails deviate from the 45-degree straight line, so the PDF of the residuals is not identical to that of a standard normal distribution.
From the residual plot, we can see that the residuals are randomly distributed and neither positively nor negatively correlated, although the variance might be slightly decreasing.
Y ~ X1
multivariateModel1 <- lm(Y ~ X1, data= myData)
summary(multivariateModel1)
##
## Call:
## lm(formula = Y ~ X1, data = myData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -280.6 -179.8 -30.9 161.6 433.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 284.6896 15.1653 18.77 <0.0000000000000002 ***
## X1 8.4741 0.1791 47.31 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 222.2 on 493 degrees of freedom
## Multiple R-squared: 0.8195, Adjusted R-squared: 0.8191
## F-statistic: 2238 on 1 and 493 DF, p-value: < 0.00000000000000022
print(coef(multivariateModel1)[1])
## (Intercept)
## 284.6896
print(coef(multivariateModel1)[2])
## X1
## 8.474147
#Adjusted R Squared
adj.r.squared <- summary(multivariateModel1)[9]
adj.r.squared
## $adj.r.squared
## [1] 0.8191429
Y ~ X1 + X2
multivariateModel1 <- lm(Y ~ X1 + X2, data= myData)
summary(multivariateModel1)
##
## Call:
## lm(formula = Y ~ X1 + X2, data = myData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -81.22 -35.17 -17.91 48.62 93.08
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -167.1892 5.2146 -32.06 <0.0000000000000002 ***
## X1 8.4422 0.0363 232.54 <0.0000000000000002 ***
## X2 158.5647 1.4781 107.28 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.04 on 492 degrees of freedom
## Multiple R-squared: 0.9926, Adjusted R-squared: 0.9926
## F-statistic: 3.3e+04 on 2 and 492 DF, p-value: < 0.00000000000000022
print(coef(multivariateModel1)[1])
## (Intercept)
## -167.1892
print(coef(multivariateModel1)[2])
## X1
## 8.442244
#Adjusted R Squared
adj.r.squared <- summary(multivariateModel1)[9]
adj.r.squared
## $adj.r.squared
## [1] 0.9925699
Y ~ X1 + X2 + X3
multivariateModel1 <- lm(Y ~ X1 + X2 + X3, data= myData)
summary(multivariateModel1)
##
## Call:
## lm(formula = Y ~ X1 + X2 + X3, data = myData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -80.85 -35.51 -18.62 48.59 91.64
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -167.92888 5.53997 -30.312 <0.0000000000000002 ***
## X1 8.44224 0.03634 232.338 <0.0000000000000002 ***
## X2 158.56797 1.47938 107.185 <0.0000000000000002 ***
## X3 39.79333 99.96786 0.398 0.691
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.08 on 491 degrees of freedom
## Multiple R-squared: 0.9926, Adjusted R-squared: 0.9926
## F-statistic: 2.196e+04 on 3 and 491 DF, p-value: < 0.00000000000000022
print(coef(multivariateModel1)[1])
## (Intercept)
## -167.9289
print(coef(multivariateModel1)[2])
## X1
## 8.442238
#Adjusted R Squared
adj.r.squared <- summary(multivariateModel1)[9]
adj.r.squared
## $adj.r.squared
## [1] 0.9925572
Y ~ X1 + X2 + X3 + X5
multivariateModel1 <- lm(Y ~ X1 + X2 + X3 + X5, data= myData)
summary(multivariateModel1)
##
## Call:
## lm(formula = Y ~ X1 + X2 + X3 + X5, data = myData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -101.03 -34.66 -18.36 47.65 90.06
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -168.35344 5.50638 -30.574 < 0.0000000000000002 ***
## X1 8.43587 0.03618 233.183 < 0.0000000000000002 ***
## X2 158.43963 1.47058 107.740 < 0.0000000000000002 ***
## X3 54.50974 99.46878 0.548 0.58394
## X5 99.63186 36.60312 2.722 0.00672 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.79 on 490 degrees of freedom
## Multiple R-squared: 0.9927, Adjusted R-squared: 0.9927
## F-statistic: 1.669e+04 on 4 and 490 DF, p-value: < 0.00000000000000022
print(coef(multivariateModel1)[1])
## (Intercept)
## -168.3534
print(coef(multivariateModel1)[2])
## X1
## 8.435869
#Adjusted R Squared
adj.r.squared <- summary(multivariateModel1)[9]
adj.r.squared
## $adj.r.squared
## [1] 0.9926531
ars_analysis <- c("Y ~ X1", "Y ~ X1 + X2", "Y ~ X1 + X2 + X3", "Y ~ X1 + X2 + X3 + X5")
adj.r.squared <- c(0.8191429, 0.9925699, 0.9925572, 0.9926531)
data.frame(ars_analysis, adj.r.squared)
##            ars_analysis adj.r.squared
## 1                Y ~ X1     0.8191429
## 2           Y ~ X1 + X2     0.9925699
## 3      Y ~ X1 + X2 + X3     0.9925572
## 4 Y ~ X1 + X2 + X3 + X5     0.9926531
From the analysis above, the model with all four variables has the highest adjusted R squared, although adding X3 actually reduced the adjusted R squared slightly. Because the gain beyond the first two variables is negligible, we will use X1 and X2 for our prediction; these two variables also have the highest correlation with Y.
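As a cross-check, the same comparison can be generated programmatically instead of copying the values by hand; a minimal sketch:
# Adjusted R-squared for each nested model, computed in a loop
formulas <- c("Y ~ X1", "Y ~ X1 + X2",
              "Y ~ X1 + X2 + X3", "Y ~ X1 + X2 + X3 + X5")
sapply(formulas, function(f) summary(lm(as.formula(f), data = myData))$adj.r.squared)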
Now we use the two variables X1 and X2 to predict Yhat, and we either reject or fail to reject the null hypothesis H0, which states that the predicted values Yhat are equal to the original Y values.
multivariateModel_final <- lm(Y ~ X1 + X2, data= myData)
summary(multivariateModel_final)
##
## Call:
## lm(formula = Y ~ X1 + X2, data = myData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -81.22 -35.17 -17.91 48.62 93.08
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -167.1892 5.2146 -32.06 <0.0000000000000002 ***
## X1 8.4422 0.0363 232.54 <0.0000000000000002 ***
## X2 158.5647 1.4781 107.28 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.04 on 492 degrees of freedom
## Multiple R-squared: 0.9926, Adjusted R-squared: 0.9926
## F-statistic: 3.3e+04 on 2 and 492 DF, p-value: < 0.00000000000000022
print(coef(multivariateModel_final)[1])
## (Intercept)
## -167.1892
print(coef(multivariateModel_final)[2])
## X1
## 8.442244
print(coef(multivariateModel_final)[3])
## X2
## 158.5647
#Standard Error squared
(summary(multivariateModel_final)$sigma)^2
## [1] 2028.76
p_value <- anova(multivariateModel_final)$'Pr(>F)'[1]
p_value
## [1] 0
rSquared <- summary(multivariateModel_final)$r.squared
rSquared
## [1] 0.9926
f_value <- summary(multivariateModel_final)[10]
f_value
## $fstatistic
## value numdf dendf
## 32997.34 2.00 492.00
adj.r.squared <- summary(multivariateModel_final)[9]
adj.r.squared
## $adj.r.squared
## [1] 0.9925699
par(mfrow=c(2,2))
plot(multivariateModel_final)
par(mfrow=c(1,1))
#plot(multivariateModel, which = 2, main = "Q~Q Plot")
mmf.stdres <- rstandard(multivariateModel_final)
qqnorm(mmf.stdres,
       ylab = "Standardized Residuals",
       xlab = "Normal Scores",
       main = "Q~Q Plot")
qqline(mmf.stdres)
plot(multivariateModel_final$residuals, main = "Residuals")
The intercept and coefficients are shown above. The coefficient of determination R2 is 0.9926, which is close to 1. This means we have a good fit.
The Q-Q plot above shows that the residuals are not normally distributed: the left and right tails deviate from the 45-degree straight line, so the PDF of the residuals is not identical to that of a standard normal distribution.
From the residual plot, we can see that the residuals are randomly distributed, neither positively nor negatively correlated, and of constant variance.
Since we have a good fit, we fail to reject H0. This conclusion is based on the high R squared value and on the fact that the chosen variables give a high adjusted R squared.
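The predictions themselves can be inspected with predict(); a minimal sketch comparing the first few fitted values with the observed Y:
# Compare observed Y with the final model's predictions
Yhat <- predict(multivariateModel_final, newdata = myData)
head(data.frame(Y = myData$Y, Yhat = Yhat, residual = myData$Y - Yhat))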
1.4 Comment on the results.