Number 2

#The difference between the KNN classifier and KNN regression is the type of response variable: the classifier predicts a categorical (qualitative) response, while regression predicts a continuous (quantitative) response.
## Number 9

library(ISLR2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load the Auto data set (presumably from the ISLR2 package attached above).
data("Auto")

Part A

# Scatterplot matrix of every pair of columns in Auto.
pairs(Auto)

## Part B

# Correlation matrix of the first eight columns of Auto (mpg through origin).
cor(Auto[, 1:8])

##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
## origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
##              acceleration       year     origin
## mpg             0.4233285  0.5805410  0.5652088
## cylinders      -0.5046834 -0.3456474 -0.5689316
## displacement   -0.5438005 -0.3698552 -0.6145351
## horsepower     -0.6891955 -0.4163615 -0.4551715
## weight         -0.4168392 -0.3091199 -0.5850054
## acceleration    1.0000000  0.2903161  0.2127458
## year            0.2903161  1.0000000  0.1815277
## origin          0.2127458  0.1815277  1.0000000

Part C

# Strongly prefer fixed over scientific notation when printing numbers.
options(scipen = 999)
# Regress mpg on all other variables among the first eight columns of Auto.
linearmodel <- lm(mpg ~ ., data = Auto[, 1:8])
summary(linearmodel)

## 
## Call:
## lm(formula = mpg ~ ., data = Auto[, 1:8])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5903 -2.1565 -0.1169  1.8690 13.0604 
## 
## Coefficients:
##                Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)  -17.218435   4.644294  -3.707              0.00024 ***
## cylinders     -0.493376   0.323282  -1.526              0.12780    
## displacement   0.019896   0.007515   2.647              0.00844 ** 
## horsepower    -0.016951   0.013787  -1.230              0.21963    
## weight        -0.006474   0.000652  -9.929 < 0.0000000000000002 ***
## acceleration   0.080576   0.098845   0.815              0.41548    
## year           0.750773   0.050973  14.729 < 0.0000000000000002 ***
## origin         1.426141   0.278136   5.127          0.000000467 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared:  0.8215, Adjusted R-squared:  0.8182 
## F-statistic: 252.4 on 7 and 384 DF,  p-value: < 0.00000000000000022

# i. There is a relationship between the predictors and the response variable, mpg. The .8 r-squared value tells us that the model is a pretty good fit for analysis. 
# ii. From the linear model, the p-values show a significant relationship between several variables and mpg: displacement, weight, year, and origin. However, origin may need to be changed to a factor variable.
#iii. The coefficient for the year variable (0.75) tells us there is a positive relationship between year and mpg: holding the other predictors fixed, mpg increases by about 0.75 for each additional model year.

Part D

# Split the plotting device into a 2 x 2 grid so the diagnostic plots
# produced by plot() on the fitted lm object appear together.
par(mfrow = c(2,2))
plot(linearmodel)

# The residual plot suggests there are a few outliers (323, 326, 327), but not many. The residuals vs. leverage plot shows that there are some points with high leverage, but nearly all of them fall within the Cook's distance lines.

Part E

# Refit mpg with interaction terms; this overwrites the Part C model object.
# NOTE(review): inside a formula `a * a` expands to `a + a + a:a`, which
# reduces to just `a`, so `cylinders * cylinders` and `weight * weight`
# contribute only main effects here. Squared terms would have to be written
# as I(cylinders^2) and I(weight^2).
linearmodel <- lm(formula = mpg ~ cylinders * cylinders + weight * weight + horsepower * year + horsepower * weight, data = Auto[, 1:8])
summary(linearmodel)

## 
## Call:
## lm(formula = mpg ~ cylinders * cylinders + weight * weight + 
##     horsepower * year + horsepower * weight, data = Auto[, 1:8])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.3390 -1.7208 -0.0183  1.3665 11.9425 
## 
## Coefficients:
##                        Estimate    Std. Error t value             Pr(>|t|)    
## (Intercept)       -51.007282504  10.674370551  -4.778 0.000002516645302063 ***
## cylinders           0.158219457   0.203707623   0.777              0.43781    
## weight             -0.010459504   0.000703948 -14.858 < 0.0000000000000002 ***
## horsepower          0.336483446   0.104778631   3.211              0.00143 ** 
## year                1.449502772   0.130802655  11.082 < 0.0000000000000002 ***
## horsepower:year    -0.007114799   0.001302777  -5.461 0.000000084966691387 ***
## weight:horsepower   0.000045693   0.000005365   8.517 0.000000000000000373 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.861 on 385 degrees of freedom
## Multiple R-squared:  0.8677, Adjusted R-squared:  0.8656 
## F-statistic: 420.7 on 6 and 385 DF,  p-value: < 0.00000000000000022

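## Both interaction terms in the fitted model (horsepower:year and weight:horsepower) are significant. Note that `cylinders * cylinders` and `weight * weight` did not add squared terms (in a formula, `a * a` reduces to `a`), so the non-significant `cylinders` row is the main effect of cylinders, not cylinders^2; a squared term would require `I(cylinders^2)`.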

Part f

# Two side-by-side panels: mpg against squared acceleration, and mpg
# against log(horsepower).
par(mfrow = c(1,2))
plot((Auto$acceleration)^2, Auto$mpg)
plot(log(Auto$horsepower), Auto$mpg)

# After the log transformation of horsepower, its relationship with mpg appears to be more linear.

Number 10

Loading data

# Load the Carseats data set (presumably from the ISLR2 package attached
# earlier) and print the type of each column.
data("Carseats")
str(Carseats)

## 'data.frame':    400 obs. of  11 variables:
##  $ Sales      : num  9.5 11.22 10.06 7.4 4.15 ...
##  $ CompPrice  : num  138 111 113 117 141 124 115 136 132 132 ...
##  $ Income     : num  73 48 35 100 64 113 105 81 110 113 ...
##  $ Advertising: num  11 16 10 4 3 13 0 15 0 0 ...
##  $ Population : num  276 260 269 466 340 501 45 425 108 131 ...
##  $ Price      : num  120 83 80 97 128 72 108 120 124 124 ...
##  $ ShelveLoc  : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
##  $ Age        : num  42 65 59 55 38 78 71 67 76 76 ...
##  $ Education  : num  17 10 12 14 13 16 15 10 10 17 ...
##  $ Urban      : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
##  $ US         : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...

Part A

# Regress Sales on Price plus the two Yes/No factors Urban and US.
model <- lm(Sales ~ Price + Urban + US, data = Carseats)
summary(model)

## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9206 -1.6220 -0.0564  1.5786  7.0581 
## 
## Coefficients:
##              Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) 13.043469   0.651012  20.036 < 0.0000000000000002 ***
## Price       -0.054459   0.005242 -10.389 < 0.0000000000000002 ***
## UrbanYes    -0.021916   0.271650  -0.081                0.936    
## USYes        1.200573   0.259042   4.635           0.00000486 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2335 
## F-statistic: 41.52 on 3 and 396 DF,  p-value: < 0.00000000000000022

Part B

# At least one coefficient is different from zero. The low p-values for the Price and USYes coefficients show that they are significant. There is no significant difference in sales between the yes/no values of the "Urban" variable. Our R-squared is low, which tells us that the model won't predict very well as of now.

Part C

#Sales = 13.04 - 0.054(Price) - 0.022(UrbanYes) + 1.20(USYes), where UrbanYes and USYes equal 1 for "Yes" and 0 for "No"

Part D

You can reject the null hypothesis for price and USYes because the p-value is less than alpha, .05.

Part E

# Smaller model: keep only Price and US as predictors of Sales.
model <- lm(Sales ~ Price + US, data = Carseats)
summary(model)

## 
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9269 -1.6286 -0.0574  1.5766  7.0515 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) 13.03079    0.63098  20.652 < 0.0000000000000002 ***
## Price       -0.05448    0.00523 -10.416 < 0.0000000000000002 ***
## USYes        1.19964    0.25846   4.641           0.00000471 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2354 
## F-statistic: 62.43 on 2 and 397 DF,  p-value: < 0.00000000000000022

Part F

The R-squared is still not very good: only about 23.9% of the variance in Sales can be explained by the two independent variables (Price and US).

Part G

# Confidence intervals for the coefficients (default level: 2.5% to 97.5%).
confint(model)

##                   2.5 %      97.5 %
## (Intercept) 11.79032020 14.27126531
## Price       -0.06475984 -0.04419543
## USYes        0.69151957  1.70776632

Part H

# Split the plotting device into a 2 x 2 grid so the diagnostic plots
# produced by plot() on the fitted lm object appear together.
par(mfrow = c(2,2))
plot(model)

# Based on the fitted plot, there is a relatively even spread of variables that have a fitted value between 6 and 10. The Quantiles plot looks good. However, there is a high leverage between 0 and .01.

Question 12

##Part A

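For regression without an intercept, the coefficient from regressing y onto x is the same as the coefficient from regressing x onto y when the sum of the x_i^2 equals the sum of the y_i^2 ($\sum_i x_i^2 = \sum_i y_i^2$).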

Part B

# Predictor: the integers 1 through 100.
x <- seq_len(100)
# Sum of the squared x values.
sum(x^2)

## [1] 338350

# Response: twice x plus small Gaussian noise (standard deviation 0.01).
# NOTE(review): no seed is set, so the exact values differ between runs.
y <- 2 * x + rnorm(n = 100, sd = 0.01)
# Sum of the squared y values.
sum(y^2)

## [1] 1353461

# Fit both regressions through the origin (`+ 0` drops the intercept):
# y onto x, then x onto y.
fitx <- lm(formula = y ~ x + 0)
fity <- lm(formula = x ~ y + 0)
summary(fity)

## 
## Call:
## lm(formula = x ~ y + 0)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.013628 -0.003679 -0.000283  0.002634  0.015427 
## 
## Coefficients:
##      Estimate  Std. Error t value            Pr(>|t|)    
## y 0.499988722 0.000004447  112428 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.005174 on 99 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 1.264e+10 on 1 and 99 DF,  p-value: < 0.00000000000000022

# Summary of the no-intercept regression of y onto x.
summary(fitx)

## 
## Call:
## lm(formula = y ~ x + 0)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.030853 -0.005267  0.000567  0.007359  0.027258 
## 
## Coefficients:
##     Estimate Std. Error t value            Pr(>|t|)    
## x 2.00004510 0.00001779  112428 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01035 on 99 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 1.264e+10 on 1 and 99 DF,  p-value: < 0.00000000000000022

Part C

# Predictor: the integers 1 through 100.
x <- seq_len(100)
# Sum of the squared x values.
sum(x^2)

## [1] 338350

# Response: the same integers in descending order, 100 down to 1.
y <- rev(seq_len(100))
# Sum of the squared y values.
sum(y^2)

## [1] 338350

# Fit both regressions through the origin (`+ 0` drops the intercept):
# y onto x, then x onto y.
fitx <- lm(formula = y ~ x + 0)
fity <- lm(formula = x ~ y + 0)
summary(fity)

## 
## Call:
## lm(formula = x ~ y + 0)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -49.75 -12.44  24.87  62.18  99.49 
## 
## Coefficients:
##   Estimate Std. Error t value     Pr(>|t|)    
## y   0.5075     0.0866    5.86 0.0000000609 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.37 on 99 degrees of freedom
## Multiple R-squared:  0.2575, Adjusted R-squared:   0.25 
## F-statistic: 34.34 on 1 and 99 DF,  p-value: 0.00000006094

# Summary of the no-intercept regression of y onto x.
summary(fitx)

## 
## Call:
## lm(formula = y ~ x + 0)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -49.75 -12.44  24.87  62.18  99.49 
## 
## Coefficients:
##   Estimate Std. Error t value     Pr(>|t|)    
## x   0.5075     0.0866    5.86 0.0000000609 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.37 on 99 degrees of freedom
## Multiple R-squared:  0.2575, Adjusted R-squared:   0.25 
## F-statistic: 34.34 on 1 and 99 DF,  p-value: 0.00000006094

MS 4373 Assignment 2

Mia Brito

10/4/2022

Number 2

Part A

Part C

Part D

Part E

Part f

Number 10

Loading data

Part A

Part B

Part C

Part D

You can reject the null hypothesis for price and USYes because the p-value is less than alpha, .05.

Part E

Part F

The R-squared is still not very good: only about 23.9% of the variance in Sales can be explained by the two independent variables (Price and US).

Part G

Part H

Question 12

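For regression without an intercept, the coefficient from regressing y onto x is the same as the coefficient from regressing x onto y when the sum of the x_i^2 equals the sum of the y_i^2 ($\sum_i x_i^2 = \sum_i y_i^2$).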

Part B

Part C