#Question 2
##The KNN Classifier is used for categorical variables, which means that it assigns the class label for a new data point. KNN Classifier also shows Y as 0 or 1. On the other hand, the KNN regression is used for numerical variables. This method predicts the quantitative value of Y, which can also be continuous.
#Question 9 Part a)
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.4.2
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.2
# Plot every pair of variables in the Auto data set against each other.
plot(Auto)

#Question 9 Part b)
# Correlation matrix of all variables except `name`: keep every column of
# Auto whose name is not "name", then correlate what is left.
Auto1 <- Auto[, names(Auto) != "name"]
cor(Auto1)
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
#Question 9 Part c)
# Multiple linear regression of mpg on every other column of Auto except
# `name` (the `. - name` part of the formula).
y1 <- lm(formula = mpg ~ . - name, data = Auto)
summary(y1)
##
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
#i) There is a relationship between the predictors and the response: the p-value of the F-statistic is < 2.2e-16, far below 0.05, so we reject the null hypothesis that all of the coefficients are zero.
#ii) The predictors that appear to have a statistically significant relationship to the response are: displacement, weight, year and origin.
#iii) The coefficient for year is 0.750773. This means that with each additional model year, mpg increases by approximately 0.75 on average, with all the other variables held constant.
#Question 9 Part d)
# Draw the default lm diagnostic plots for y1 on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(y1)

# Studentized residuals against the fitted values of y1.
plot(x = predict(y1), y = rstudent(y1))
# Leverage (hat value) of each observation, and which one is largest.
plot(hatvalues(y1))
which.max(hatvalues(y1))
## 14
## 14

#Question 9 Part d)
## As we can see on the "residuals vs fitted" plot, the plot is U-shaped. This means that there is non-linearity.
## Looking at the QQ plot, there are numerous points on the right tail that do not fall along the normal reference line. Therefore, the normality assumption is questionable.
## The scale-location plot (square root of the absolute standardized residuals) shows that most of the observations fall inside the range of 0 to 2, so there are few extreme standardized residuals. Note that this plot is meant for checking the constant-variance assumption rather than normality.
## The Cook's Distance shows that there are no influential points that are impacting the slope coefficient.
#Question 9 Part e)
# Model with all seven main effects plus every pairwise interaction: the
# `( ... )^2` in the formula expands to the main effects and all two-way
# interaction terms.
model_interaction <- lm(
  formula = mpg ~ (cylinders + displacement + horsepower + weight +
    acceleration + year + origin)^2,
  data = Auto
)
summary(model_interaction)
##
## Call:
## lm(formula = mpg ~ (cylinders + displacement + horsepower + weight +
## acceleration + year + origin)^2, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6303 -1.4481 0.0596 1.2739 11.1386
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.548e+01 5.314e+01 0.668 0.50475
## cylinders 6.989e+00 8.248e+00 0.847 0.39738
## displacement -4.785e-01 1.894e-01 -2.527 0.01192 *
## horsepower 5.034e-01 3.470e-01 1.451 0.14769
## weight 4.133e-03 1.759e-02 0.235 0.81442
## acceleration -5.859e+00 2.174e+00 -2.696 0.00735 **
## year 6.974e-01 6.097e-01 1.144 0.25340
## origin -2.090e+01 7.097e+00 -2.944 0.00345 **
## cylinders:displacement -3.383e-03 6.455e-03 -0.524 0.60051
## cylinders:horsepower 1.161e-02 2.420e-02 0.480 0.63157
## cylinders:weight 3.575e-04 8.955e-04 0.399 0.69000
## cylinders:acceleration 2.779e-01 1.664e-01 1.670 0.09584 .
## cylinders:year -1.741e-01 9.714e-02 -1.793 0.07389 .
## cylinders:origin 4.022e-01 4.926e-01 0.816 0.41482
## displacement:horsepower -8.491e-05 2.885e-04 -0.294 0.76867
## displacement:weight 2.472e-05 1.470e-05 1.682 0.09342 .
## displacement:acceleration -3.479e-03 3.342e-03 -1.041 0.29853
## displacement:year 5.934e-03 2.391e-03 2.482 0.01352 *
## displacement:origin 2.398e-02 1.947e-02 1.232 0.21875
## horsepower:weight -1.968e-05 2.924e-05 -0.673 0.50124
## horsepower:acceleration -7.213e-03 3.719e-03 -1.939 0.05325 .
## horsepower:year -5.838e-03 3.938e-03 -1.482 0.13916
## horsepower:origin 2.233e-03 2.930e-02 0.076 0.93931
## weight:acceleration 2.346e-04 2.289e-04 1.025 0.30596
## weight:year -2.245e-04 2.127e-04 -1.056 0.29182
## weight:origin -5.789e-04 1.591e-03 -0.364 0.71623
## acceleration:year 5.562e-02 2.558e-02 2.174 0.03033 *
## acceleration:origin 4.583e-01 1.567e-01 2.926 0.00365 **
## year:origin 1.393e-01 7.399e-02 1.882 0.06062 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.695 on 363 degrees of freedom
## Multiple R-squared: 0.8893, Adjusted R-squared: 0.8808
## F-statistic: 104.2 on 28 and 363 DF, p-value: < 2.2e-16
#Question 9 Part e)
## The interactions that appear to be statistically significant are: Displacement:Year, Acceleration:Year, Acceleration:Origin
#Question 9 Part f)
# Quadratic fit: mpg modelled by weight and weight squared.
y2 <- lm(formula = mpg ~ weight + I((weight)^2), data = Auto)
# Switch to a 2 x 2 plotting grid for the diagnostic plots of y2.
par(mfrow = c(2, 2))
summary(y2)
##
## Call:
## lm(formula = mpg ~ weight + I((weight)^2), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.6246 -2.7134 -0.3485 1.8267 16.0866
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.226e+01 2.993e+00 20.800 < 2e-16 ***
## weight -1.850e-02 1.972e-03 -9.379 < 2e-16 ***
## I((weight)^2) 1.697e-06 3.059e-07 5.545 5.43e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.176 on 389 degrees of freedom
## Multiple R-squared: 0.7151, Adjusted R-squared: 0.7137
## F-statistic: 488.3 on 2 and 389 DF, p-value: < 2.2e-16
# Diagnostic plots for the quadratic model; `which` spells out the plot.lm
# default selection (residuals vs fitted, Q-Q, scale-location, leverage).
plot(y2, which = c(1, 2, 3, 5))

#Question 9 Part f)
## The QQ plot shows that the regression model does not follow the normal distribution. Furthermore, the Cook's Distance shows that there are no influential points impacting the slope coefficient.
#Question 10 Part a)
# Regress Sales on Price, Urban and US using the Carseats data.
carsales <- lm(formula = Sales ~ Price + Urban + US, data = Carseats)
summary(carsales)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
#Question 10 Part b)
## Price: For each $1 increase in price, expected sales decrease by about 0.0545 units (Sales is recorded in thousands in Carseats, so roughly 54 car seats), when all the other variables are held constant.
## UrbanYes: The coefficient (-0.0219) has a p-value of 0.936, which means that being in an urban location does not have a statistically significant relationship with sales.
## USYes: US stores sell approximately 1.2 more units (about 1,200 car seats) than stores outside the US, when Price and Urban status are held constant.
#Question 10 Part c)
## Sales = 13.043−0.05446(Price)−0.02192(UrbanYes)+1.20057(USYes)
#Question 10 Part d)
## The null hypothesis can be rejected for Price and USYes because their p-values are less than 0.05. It cannot be rejected for UrbanYes (p-value 0.936).
#Question 10 Part e)
# Smaller model: the same regression without the Urban predictor.
carsales_var <- lm(formula = Sales ~ Price + US, data = Carseats)
summary(carsales_var)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
#Question 10 Part f)
# Compare the model with Urban (carsales) against the model without it
# (carsales_var).
anova(carsales, carsales_var)
## Analysis of Variance Table
##
## Model 1: Sales ~ Price + Urban + US
## Model 2: Sales ~ Price + US
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 396 2420.8
## 2 397 2420.9 -1 -0.03979 0.0065 0.9357
#Question 10 Part f)
## After removing the urban variable, there was only a slight increase in adjusted r square and slight decrease in residual standard error. Furthermore, the Anova test shows that this difference is not statistically significant. Therefore, we fail to reject the null hypothesis.
#Question 10 Part g)
# 95% confidence intervals for the coefficients of the reduced model
# (0.95 is confint()'s default level, written out explicitly).
confint(carsales_var, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632
#Question 10 Part h)
# Outlier and leverage diagnostics for the reduced model carsales_var.
par(mfrow = c(2, 2))

# Studentized residuals against the fitted values.
plot(predict(carsales_var), rstudent(carsales_var))

# Leverage of each observation. hatvalues() is the accessor the R
# documentation recommends over hat(model.matrix(...)), and it is the one
# already used for the Auto model in Question 9.
leverage <- hatvalues(carsales_var)
plot(leverage)

# Cutoff for flagging high-leverage observations, computed once and reused
# below instead of repeating the literal 0.01.
# NOTE(review): this model has 3 coefficients, so its average leverage is
# 3 / nrow(Carseats); confirm that 4 / n is the intended cutoff.
leverage_cutoff <- 4 / nrow(Carseats)
leverage_cutoff
## [1] 0.01

# Sales against Price, with observations above the cutoff drawn in blue.
high_leverage <- leverage > leverage_cutoff
plot(Carseats$Sales, Carseats$Price)
points(
  Carseats$Sales[high_leverage],
  Carseats$Price[high_leverage],
  col = "blue"
)

#Question 10 Part h)
## Looking at the studentized residuals vs. predicted values plot, there is no evidence of outliers, because all of the residuals are within the range of -3 to 3. High leverage cannot be judged from the residuals; it has to be read from the leverage plot, where the observations whose leverage exceeds the 0.01 cutoff are the ones highlighted in blue in the Sales vs. Price plot.
#Question 12 Part a)
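## With no intercept, the coefficient estimate for the regression of Y onto X is sum(x*y)/sum(x^2), and for the regression of X onto Y it is sum(x*y)/sum(y^2). The two estimates are therefore the same when sum(x^2) = sum(y^2), i.e. when X and Y have the same sum of squares. A perfect linear relationship is not enough: if y = 2x, the two coefficients are 2 and 0.5.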
#Question 12 Part b)
# Example where the no-intercept regressions of y on x and of x on y give
# different coefficients.
# Seed the random number generator so the simulated data, and therefore
# the fitted coefficient, are the same on every run.
# NOTE: the summary output pasted below came from an unseeded run, so its
# numbers will differ from what this seed produces.
set.seed(1)
x <- rnorm(100)
y <- rbinom(100, 2, 0.3)
# `+ 0` removes the intercept from the model.
example <- lm(y ~ x + 0)
summary(example)
##
## Call:
## lm(formula = y ~ x + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.36756 0.04461 0.76948 1.07762 2.19825
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x 0.18166 0.08967 2.026 0.0455 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9239 on 99 degrees of freedom
## Multiple R-squared: 0.03981, Adjusted R-squared: 0.03011
## F-statistic: 4.104 on 1 and 99 DF, p-value: 0.04547
# Reverse regression: x on y, again without an intercept.
example1 <- lm(formula = x ~ y + 0)
summary(example1)
##
## Call:
## lm(formula = x ~ y + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.22555 -0.81805 -0.03483 0.71855 2.26208
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## y 0.2191 0.1082 2.026 0.0455 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.015 on 99 degrees of freedom
## Multiple R-squared: 0.03981, Adjusted R-squared: 0.03011
## F-statistic: 4.104 on 1 and 99 DF, p-value: 0.04547
#Question 12 Part c)
# Example where both no-intercept regressions give the same coefficient:
# y is x in reverse order, so sum(x^2) equals sum(y^2).
x <- seq_len(100)
y <- rev(x)
# `+ 0` removes the intercept from the model.
example2 <- lm(formula = y ~ x + 0)
summary(example2)
##
## Call:
## lm(formula = y ~ x + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.75 -12.44 24.87 62.18 99.49
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x 0.5075 0.0866 5.86 6.09e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.37 on 99 degrees of freedom
## Multiple R-squared: 0.2575, Adjusted R-squared: 0.25
## F-statistic: 34.34 on 1 and 99 DF, p-value: 6.094e-08
# Reverse regression on the same data: x on y without an intercept.
example3 <- lm(formula = x ~ y + 0)
summary(example3)
##
## Call:
## lm(formula = x ~ y + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.75 -12.44 24.87 62.18 99.49
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## y 0.5075 0.0866 5.86 6.09e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.37 on 99 degrees of freedom
## Multiple R-squared: 0.2575, Adjusted R-squared: 0.25
## F-statistic: 34.34 on 1 and 99 DF, p-value: 6.094e-08