#Greeshma Ganji
#ISTE 780
#Summer 2023
#PART-I
# Read the Auto data set from the local Lab 2 folder (first row holds the
# column names) and print a per-column summary.
Auto <- read.csv(
  file = "/Users/greeshmaganji/RIT/ISTE780/Lab2/Auto.csv",
  header = TRUE
)
# View(Auto)
# names(Auto)
summary(object = Auto)
## mpg cylinders displacement horsepower
## Min. : 9.00 Min. :3.000 Min. : 68.0 Length:397
## 1st Qu.:17.50 1st Qu.:4.000 1st Qu.:104.0 Class :character
## Median :23.00 Median :4.000 Median :146.0 Mode :character
## Mean :23.52 Mean :5.458 Mean :193.5
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:262.0
## Max. :46.60 Max. :8.000 Max. :455.0
## weight acceleration year origin
## Min. :1613 Min. : 8.00 Min. :70.00 Min. :1.000
## 1st Qu.:2223 1st Qu.:13.80 1st Qu.:73.00 1st Qu.:1.000
## Median :2800 Median :15.50 Median :76.00 Median :1.000
## Mean :2970 Mean :15.56 Mean :75.99 Mean :1.574
## 3rd Qu.:3609 3rd Qu.:17.10 3rd Qu.:79.00 3rd Qu.:2.000
## Max. :5140 Max. :24.80 Max. :82.00 Max. :3.000
## name
## Length:397
## Class :character
## Mode :character
##
##
##
# Make the two character columns numeric so pairs(), cor() and lm() accept them.
# `name` is only a label (it is excluded from every model in this script), so
# arbitrary integer codes obtained through factor() are acceptable for it.
Auto$name <- as.numeric(factor(Auto$name))
# `horsepower` is reported by summary() as class character, presumably because
# missing values are coded as "?" in the CSV -- TODO confirm against the file.
# Going through factor() would replace every value with its alphabetical level
# index instead of the real horsepower, which distorts every correlation and
# regression coefficient involving horsepower. Parse the numbers directly;
# as.numeric() will warn if any other non-numeric token is present.
horsepower_raw <- as.character(Auto$horsepower)
horsepower_raw[horsepower_raw %in% "?"] <- NA
Auto$horsepower <- as.numeric(horsepower_raw)
# Drop rows with missing horsepower so cor() does not return NA; lm() would
# omit these rows anyway. Row names are kept, so observation labels in the
# diagnostic plots still refer to the original row numbers.
Auto <- Auto[!is.na(Auto$horsepower), ]
# NOTE(review): the outputs pasted as "##" comments in this script were produced
# with the old factor-code encoding and need to be regenerated.
# 1a) Scatterplot matrix covering every column of the data set.
pairs(x = Auto)

# View(Auto)
# 1b) Correlation matrix of all variables except `name`, computed with cor().
cor(x = Auto[, names(Auto) != "name", drop = FALSE])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7762599 -0.8044430 0.4228227 -0.8317389
## cylinders -0.7762599 1.0000000 0.9509199 -0.5466585 0.8970169
## displacement -0.8044430 0.9509199 1.0000000 -0.4820705 0.9331044
## horsepower 0.4228227 -0.5466585 -0.4820705 1.0000000 -0.4821507
## weight -0.8317389 0.8970169 0.9331044 -0.4821507 1.0000000
## acceleration 0.4222974 -0.5040606 -0.5441618 0.2662877 -0.4195023
## year 0.5814695 -0.3467172 -0.3698041 0.1274167 -0.3079004
## origin 0.5636979 -0.5649716 -0.6106643 0.2973734 -0.5812652
## acceleration year origin
## mpg 0.4222974 0.5814695 0.5636979
## cylinders -0.5040606 -0.3467172 -0.5649716
## displacement -0.5441618 -0.3698041 -0.6106643
## horsepower 0.2662877 0.1274167 0.2973734
## weight -0.4195023 -0.3079004 -0.5812652
## acceleration 1.0000000 0.2829009 0.2100836
## year 0.2829009 1.0000000 0.1843141
## origin 0.2100836 0.1843141 1.0000000
# 1c) Multiple linear regression: mpg is the response and every other column
# except `name` is used as a predictor.
Auto_fit_1 <- lm(formula = mpg ~ . - name, data = Auto)
summary(object = Auto_fit_1)
##
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.629 -2.034 -0.046 1.801 13.010
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.128e+01 4.259e+00 -4.998 8.78e-07 ***
## cylinders -2.927e-01 3.382e-01 -0.865 0.3874
## displacement 1.603e-02 7.284e-03 2.201 0.0283 *
## horsepower 7.942e-03 6.809e-03 1.166 0.2442
## weight -6.870e-03 5.799e-04 -11.846 < 2e-16 ***
## acceleration 1.539e-01 7.750e-02 1.986 0.0477 *
## year 7.734e-01 4.939e-02 15.661 < 2e-16 ***
## origin 1.346e+00 2.691e-01 5.004 8.52e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.331 on 389 degrees of freedom
## Multiple R-squared: 0.822, Adjusted R-squared: 0.8188
## F-statistic: 256.7 on 7 and 389 DF, p-value: < 2.2e-16
# (i) Yes, there is a relationship between the predictors and the response.
# The R-squared value of 0.822 indicates that the predictors explain 82.2% of the variance in mpg.
# The F-statistic of 256.7 (p-value < 2.2e-16) also indicates that there is a relationship between the predictors and the response.
# (ii) displacement, weight, acceleration, year and origin appear to have a statistically
# significant relationship to the response, as their p-values are less than 0.05.
# (iii) The coefficient for the year variable (7.734e-01) suggests that, holding the other predictors fixed, the average effect of an increase of 1 year is an increase of 0.7734 in "mpg",
# i.e. cars become more fuel efficient over time.
# 1d) Diagnostic plots of the linear regression fit, drawn with plot() on a
# 2 x 2 grid.
par(mfrow = c(2, 2))
plot(Auto_fit_1, which = c(1, 2, 3, 5))

# The plots flag a few unusual observations (outliers), e.g. 323, 320, 394.
# Residuals-vs-leverage plot on its own.
plot(x = Auto_fit_1, which = 5)
# 1e) Linear regression models with interaction effects, written with the
# `*` (main effects plus interaction) and `:` (interaction only) operators.
Auto_fit_2 <- lm(
  formula = mpg ~ cylinders * displacement + displacement:weight,
  data = Auto[, 1:8]
)
summary(object = Auto_fit_2)
##
## Call:
## lm(formula = mpg ~ cylinders * displacement + displacement:weight,
## data = Auto[, 1:8])
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.2630 -2.5881 -0.2766 2.1773 19.7551
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.028e+01 2.357e+00 21.336 < 2e-16 ***
## cylinders -3.305e+00 5.766e-01 -5.732 1.98e-08 ***
## displacement -1.150e-01 1.690e-02 -6.804 3.82e-11 ***
## cylinders:displacement 1.665e-02 2.376e-03 7.007 1.07e-11 ***
## displacement:weight -1.044e-05 2.803e-06 -3.725 0.000224 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.396 on 392 degrees of freedom
## Multiple R-squared: 0.6876, Adjusted R-squared: 0.6844
## F-statistic: 215.7 on 4 and 392 DF, p-value: < 2.2e-16
# All predictors except `name`, plus a displacement-by-weight interaction term.
Auto_fit_3 <- lm(formula = mpg ~ . - name + displacement:weight, data = Auto)
summary(object = Auto_fit_3)
##
## Call:
## lm(formula = mpg ~ . - name + displacement:weight, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.8561 -1.8167 -0.0141 1.7027 12.1594
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.130e+01 3.948e+00 -2.863 0.00443 **
## cylinders 2.463e-01 3.079e-01 0.800 0.42424
## displacement -7.153e-02 1.104e-02 -6.479 2.8e-10 ***
## horsepower 2.114e-03 6.129e-03 0.345 0.73029
## weight -1.127e-02 6.854e-04 -16.437 < 2e-16 ***
## acceleration 2.100e-01 6.966e-02 3.014 0.00275 **
## year 8.181e-01 4.448e-02 18.394 < 2e-16 ***
## origin 4.428e-01 2.580e-01 1.716 0.08687 .
## displacement:weight 2.212e-05 2.249e-06 9.833 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.984 on 388 degrees of freedom
## Multiple R-squared: 0.8575, Adjusted R-squared: 0.8546
## F-statistic: 291.9 on 8 and 388 DF, p-value: < 2.2e-16
# The interaction between displacement and weight is statistically significant.
# 1f) Different transformations of a variable, such as log(X), X^0.5 and X^2,
# each plotted against mpg (X is horsepower here).
par(mfrow = c(2, 2))

with(Auto, {
  plot(log(horsepower), mpg, xlab = "Log(X)", ylab = "mpg")
  plot(sqrt(horsepower), mpg, xlab = "sqrt(X)", ylab = "mpg")
  plot((horsepower)^2, mpg, xlab = "X Square", ylab = "mpg")
})
#################################################################################################################################################
#PART-II
# Load the Carseats data set from the ISLR package.
data(list = "Carseats", package = "ISLR")
# 2a) Multiple regression model predicting Sales from Price, Urban and US.
Carseats_1 <- lm(formula = Sales ~ Price + Urban + US, data = Carseats)
summary(object = Carseats_1)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
# 2b) Sales is recorded in thousands of units, so the coefficients can be interpreted as follows:
# Price: the average effect of a price increase of 1 dollar is a decrease of 54.459 units in sales, holding the other predictors fixed.
# Urban: average unit sales in an urban location are 21.916 units lower than in a rural location (this difference is not statistically significant, p = 0.936).
# US: average unit sales in a US store are 1200.573 units higher than in a non-US store.
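# 2c) The model can be written as follows:
# Sales = 13.043469 + (−0.054459)×Price + (−0.021916)×Urban + (1.200573)×US + e, where Urban = 1 for an urban store (0 otherwise) and US = 1 for a US store (0 otherwise)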
# 2d) The null hypothesis can be rejected for Price and US (p-values < 0.05). Since the p-value for Urban (0.936) is greater than 0.05, we cannot reject it for Urban.
# 2e) Reduced model that keeps only the predictors Price and US.
Carseats_2 <- lm(formula = Sales ~ Price + US, data = Carseats)
summary(object = Carseats_2)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
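# 2f) Both models explain only about 24% of the variance in Sales (R-squared = 0.2393 in each), so neither fits the data very well. The smaller model is slightly better than the bigger model: it has a higher adjusted R-squared (0.2354 vs 0.2335) and a lower residual standard error (2.469 vs 2.472) with one fewer predictor.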
# 2g) 95% confidence intervals for the coefficients of the reduced model.
confint(object = Carseats_2, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632
