grocery <- read.csv("/Users/caelynsobie/Downloads/grocery-retailer.csv")
library(ggplot2)
#a
pairs(grocery)

#it appears there might be a linear relationship between y and x3. other than that there does not appear to be a linear relationshp between any other varaibles.

#a
cor(grocery)
##            y         x1         x2         x3
## y  1.0000000 0.20766494 0.06002960 0.81057940
## x1 0.2076649 1.00000000 0.08489639 0.04565698
## x2 0.0600296 0.08489639 1.00000000 0.11337076
## x3 0.8105794 0.04565698 0.11337076 1.00000000

#This agrees with our previous finding of only a linear relationship between y and x3 with a correlation of 0.811

#b
model <- lm(formula = y~x1 + x2 + x3, grocery)
summary(model)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = grocery)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -264.05 -110.73  -22.52   79.29  295.75 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.150e+03  1.956e+02  21.220  < 2e-16 ***
## x1           7.871e-04  3.646e-04   2.159   0.0359 *  
## x2          -1.317e+01  2.309e+01  -0.570   0.5712    
## x3           6.236e+02  6.264e+01   9.954 2.94e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 143.3 on 48 degrees of freedom
## Multiple R-squared:  0.6883, Adjusted R-squared:  0.6689 
## F-statistic: 35.34 on 3 and 48 DF,  p-value: 3.316e-12

#As every case shipped increases by one, the estimated mean total labor hours increase by 7.871e-04. Under alpha = 0.05 this is significant since the p-value is 0.0359.

#As the indirect cost of the total labor hours as a percentage increases by one, the estimated mean total labor hours decreases by 1.317e+01. Under alpha = 0.05 this is insignificant since the p-value is 0.5712.

#When the week is a holiday, the estimated mean total labor hours decreases by 6.236e+02.Under alpha = 0.05 this is significant since the p-value is 2.94e-13.

#c residuals against Yhat, X1,X2,X3,X1X2 and QQ plot
res <- model$residuals
plot(grocery$x1,res)

plot(grocery$x2,res)

plot(grocery$x3,res)

plot(model$fitted.values, res)

x1x2 <- grocery$x1 * grocery$x2
plot(x1x2, res)

qqnorm(res)
qqline(res)

#mostly normal qq plot right scewed
#d
model_red1 <- lm(formula = y~x1 + x2, grocery)
summary(model_red1)
## 
## Call:
## lm(formula = y ~ x1 + x2, data = grocery)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -376.21 -173.77  -49.36  123.73  601.11 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.995e+03  3.378e+02  11.829 5.72e-16 ***
## x1          9.192e-04  6.312e-04   1.456    0.152    
## x2          1.212e+01  3.977e+01   0.305    0.762    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 248.3 on 49 degrees of freedom
## Multiple R-squared:  0.04494,    Adjusted R-squared:  0.005953 
## F-statistic: 1.153 on 2 and 49 DF,  p-value: 0.3242
#e
library(car)
## Loading required package: carData
avPlots(model_red1)

#f
model_y_x1 <- lm(y~grocery$x1, data = grocery)
res_y_x1 <- model_y_x1$residuals
summary(model_y_x1)
## 
## Call:
## lm(formula = y ~ grocery$x1, data = grocery)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -356.18 -164.64  -56.07  111.23  619.01 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.080e+03  1.917e+02  21.283   <2e-16 ***
## grocery$x1  9.355e-04  6.232e-04   1.501     0.14    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 246 on 50 degrees of freedom
## Multiple R-squared:  0.04312,    Adjusted R-squared:  0.02399 
## F-statistic: 2.253 on 1 and 50 DF,  p-value: 0.1396
#f
model_x2_x1 <- lm(x2~grocery$x1, data = grocery)
res_x2_x1 <- model_x2_x1$residuals
summary(model_x2_x1)
## 
## Call:
## lm(formula = x2 ~ grocery$x1, data = grocery)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.78008 -0.52558 -0.02846  0.56248  2.18874 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.963e+00  6.880e-01  10.120 1.07e-13 ***
## grocery$x1  1.348e-06  2.237e-06   0.602     0.55    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8829 on 50 degrees of freedom
## Multiple R-squared:  0.007207,   Adjusted R-squared:  -0.01265 
## F-statistic: 0.363 on 1 and 50 DF,  p-value: 0.5496
model_res <- lm(res_y_x1 ~ res_x2_x1)
summary(model_res)
## 
## Call:
## lm(formula = res_y_x1 ~ res_x2_x1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -376.21 -173.77  -49.36  123.73  601.11 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.525e-15  3.408e+01   0.000    1.000
## res_x2_x1   1.212e+01  3.937e+01   0.308    0.759
## 
## Residual standard error: 245.8 on 50 degrees of freedom
## Multiple R-squared:  0.001892,   Adjusted R-squared:  -0.01807 
## F-statistic: 0.0948 on 1 and 50 DF,  p-value: 0.7594

Problem 4

library(datasets)
data("stackloss")
#a
stack_model <- lm(stack.loss ~ Air.Flow + Water.Temp + Acid.Conc., data = as.data.frame(stack.x))
summary(stack_model)
## 
## Call:
## lm(formula = stack.loss ~ Air.Flow + Water.Temp + Acid.Conc., 
##     data = as.data.frame(stack.x))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.2377 -1.7117 -0.4551  2.3614  5.6978 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -39.9197    11.8960  -3.356  0.00375 ** 
## Air.Flow      0.7156     0.1349   5.307  5.8e-05 ***
## Water.Temp    1.2953     0.3680   3.520  0.00263 ** 
## Acid.Conc.   -0.1521     0.1563  -0.973  0.34405    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.243 on 17 degrees of freedom
## Multiple R-squared:  0.9136, Adjusted R-squared:  0.8983 
## F-statistic:  59.9 on 3 and 17 DF,  p-value: 3.016e-09
#b
stack_model_air <- lm(stack.loss ~ Air.Flow, data = as.data.frame(stack.x))
summary(stack_model_air)
## 
## Call:
## lm(formula = stack.loss ~ Air.Flow, data = as.data.frame(stack.x))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.2896  -1.1272  -0.0459   1.1166   8.8728 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -44.13202    6.10586  -7.228 7.31e-07 ***
## Air.Flow      1.02031    0.09995  10.208 3.77e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.098 on 19 degrees of freedom
## Multiple R-squared:  0.8458, Adjusted R-squared:  0.8377 
## F-statistic: 104.2 on 1 and 19 DF,  p-value: 3.774e-09
#c
res_full <- stack.loss - predict(stack_model)
sse_full <-sum(res_full^2)

res_red <- stack.loss - predict(stack_model_air)
sse_red <- sum(res_red^2)
cat("SSE (Full model):", sse_full, "\n")
## SSE (Full model): 178.83
cat("SSE (Reduced model):", sse_red, "\n")
## SSE (Reduced model): 319.1161
n <- nrow(stack.x)

# Degrees of freedom for the full model
p_full <- length(coef(stack_model))  # Number of parameters in full model
df_full <- n - p_full

# Degrees of freedom for the reduced model
df_reduced <- n - 2

cat("Degrees of Freedom (Full model):", df_full, "\n")
## Degrees of Freedom (Full model): 17
cat("Degrees of Freedom (Reduced model):", df_reduced, "\n")
## Degrees of Freedom (Reduced model): 19
#d
f_stat_num <- (sse_red - sse_full)/(df_reduced - df_full)
f_stat_dem <- sse_full / df_full
f_stat <- f_stat_num / f_stat_dem
f_stat
## [1] 6.667967
crit_f <- qf(1-0.05, df_reduced - df_full, df_full)
crit_f
## [1] 3.591531
#e
pf(f_stat, df_reduced - df_full, df_full, lower.tail = FALSE)
## [1] 0.007280786

Problem 5

library(faraway)
## 
## Attaching package: 'faraway'
## The following objects are masked from 'package:car':
## 
##     logit, vif
data("sat")
#a
sat_model <- lm(total ~ expend + salary + ratio, data = sat)
summary(sat_model)
## 
## Call:
## lm(formula = total ~ expend + salary + ratio, data = sat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -140.911  -46.740   -7.535   47.966  123.329 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1069.234    110.925   9.639 1.29e-12 ***
## expend        16.469     22.050   0.747   0.4589    
## salary        -8.823      4.697  -1.878   0.0667 .  
## ratio          6.330      6.542   0.968   0.3383    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 68.65 on 46 degrees of freedom
## Multiple R-squared:  0.2096, Adjusted R-squared:  0.1581 
## F-statistic: 4.066 on 3 and 46 DF,  p-value: 0.01209

#b) As the expenditure per pupil increases by one, the estimated mean SAT score increases by 16.469. This appears to be insigificant since with alpha = 0.05, the p-value is 0.4589.

#As the estimated average salary of teachers increases by one, the estimated mean SAT score decreases by 8.823. This appears to be insigificant since with alpha = 0.05, the p-value is 0.0667.

#b) As the average pupil/teacher ratio increases by one, the estimated mean SAT score increases by 6.330. This appears to be insigificant since with alpha = 0.05, the p-value is 0.3383.

#c
sat_model_take <- lm(total ~ expend + salary + ratio + takers, data = sat)
summary(sat_model_take)
## 
## Call:
## lm(formula = total ~ expend + salary + ratio + takers, data = sat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -90.531 -20.855  -1.746  15.979  66.571 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1045.9715    52.8698  19.784  < 2e-16 ***
## expend         4.4626    10.5465   0.423    0.674    
## salary         1.6379     2.3872   0.686    0.496    
## ratio         -3.6242     3.2154  -1.127    0.266    
## takers        -2.9045     0.2313 -12.559 2.61e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.7 on 45 degrees of freedom
## Multiple R-squared:  0.8246, Adjusted R-squared:  0.809 
## F-statistic: 52.88 on 4 and 45 DF,  p-value: < 2.2e-16
#d
esr <- rstudent(sat_model_take)
plot(esr)

alpha = 0.1
num = nrow(sat)
bon_test <- qt(1-(alpha/(2*num)),num-5)
bon_test
## [1] 3.28148
#e
influence.measures(sat_model_take)
## Influence measures of
##   lm(formula = total ~ expend + salary + ratio + takers, data = sat) :
## 
##                   dfb.1_  dfb.expn dfb.slry dfb.rati  dfb.tkrs   dffit cov.r
## Alabama        -0.010968  0.016083 -0.01465  0.01246  0.009287 -0.0213 1.236
## Alaska          0.329192 -0.203908  0.01885 -0.15367  0.162010 -0.4788 1.214
## Arizona         0.039954  0.012636  0.01592 -0.08386 -0.031505 -0.1968 1.082
## Arkansas       -0.102502  0.072421 -0.04365  0.05947  0.086620 -0.2160 1.078
## California      0.174948  0.118793 -0.12779 -0.11353 -0.037696 -0.3558 1.503
## Colorado       -0.004665 -0.002878  0.00275  0.00542 -0.001721  0.0218 1.152
## Connecticut     0.043301 -0.241875  0.35844 -0.29431 -0.042669  0.5277 1.297
## Delaware       -0.026896  0.036038 -0.03465  0.03347  0.064331  0.1034 1.165
## Florida         0.171083 -0.251733  0.28991 -0.30547 -0.189656 -0.3626 1.190
## Georgia        -0.089786  0.056707  0.00746  0.02730 -0.162854 -0.2076 1.179
## Hawaii          0.008805  0.052365 -0.04880  0.00690 -0.048352 -0.1218 1.144
## Idaho          -0.010452  0.043337 -0.01201 -0.03068  0.006906 -0.1698 1.128
## Illinois        0.010840 -0.077886  0.10749 -0.06730 -0.094617  0.1352 1.276
## Indiana        -0.012996  0.054590 -0.03757  0.01147 -0.072887 -0.1315 1.135
## Iowa            0.146491 -0.022347  0.04979 -0.13371 -0.269136  0.3901 0.908
## Kansas          0.064367 -0.066918  0.08342 -0.09488 -0.093129  0.1326 1.227
## Kentucky       -0.055584  0.053671 -0.05914  0.05460  0.115230 -0.1905 1.061
## Louisiana      -0.002552 -0.013671  0.01744 -0.00966  0.000612 -0.0248 1.225
## Maine           0.065238  0.089610 -0.13058  0.01925  0.142860  0.2174 1.258
## Maryland       -0.036694  0.028005 -0.01726  0.02913  0.036164  0.0860 1.165
## Massachusetts   0.121425 -0.115373  0.11054 -0.17372  0.252030  0.4840 0.941
## Michigan        0.083614 -0.030896 -0.00787 -0.04800  0.069961 -0.1181 1.330
## Minnesota      -0.053012 -0.018528  0.10317 -0.02670 -0.276240  0.3477 0.967
## Mississippi     0.009327 -0.003042 -0.00194 -0.00071 -0.005181  0.0237 1.192
## Missouri        0.025220 -0.008855  0.00962 -0.02253 -0.028187  0.0473 1.180
## Montana        -0.013822  0.101987 -0.11053  0.06839  0.012165  0.1265 1.257
## Nebraska        0.019231  0.014320 -0.01128 -0.01177 -0.026133  0.0518 1.211
## Nevada          0.011256  0.193287 -0.17732  0.01371  0.025487 -0.3777 0.843
## New Hampshire   0.288470 -0.177760  0.02122 -0.15660  0.495646  0.6582 0.727
## New Jersey      0.088106 -0.161538  0.09431 -0.05161  0.001040 -0.2525 1.402
## New Mexico     -0.008748 -0.000345  0.00506 -0.00103  0.004880 -0.0258 1.170
## New York        0.039442 -0.046978  0.02375 -0.02266 -0.003108 -0.0799 1.380
## North Carolina -0.089982  0.023094  0.04725  0.00543 -0.167509 -0.2121 1.169
## North Dakota    0.329866  0.147137 -0.24289 -0.06672 -0.157294  0.6574 0.718
## Ohio           -0.027760  0.075672 -0.11966  0.09067  0.137909 -0.2153 1.048
## Oklahoma       -0.022026  0.002617  0.00214  0.01178  0.011361 -0.0347 1.187
## Oregon         -0.235979  0.157568 -0.13770  0.25652  0.105385  0.3123 1.122
## Pennsylvania    0.000111  0.038757 -0.04620  0.02513 -0.007381 -0.0642 1.271
## Rhode Island   -0.002191  0.001001 -0.00193  0.00401 -0.004106 -0.0120 1.200
## South Carolina -0.229763  0.117011  0.04964  0.04909 -0.360615 -0.4806 0.975
## South Dakota    0.173443  0.016503 -0.05486 -0.08362 -0.068242  0.2604 1.153
## Tennessee       0.063024 -0.174079  0.16131 -0.08677 -0.083963  0.2385 1.156
## Texas          -0.163161  0.065455  0.01142  0.07415 -0.139793 -0.2560 1.040
## Utah           -0.958297  0.542396 -0.64657  1.32142  0.120463  1.6250 0.802
## Vermont         0.061879  0.014801 -0.03245 -0.03386  0.080162  0.1400 1.195
## Virginia        0.123009 -0.121882  0.08548 -0.11635  0.070409  0.1715 1.296
## Washington     -0.136046  0.102315 -0.10643  0.17308  0.084684  0.2024 1.208
## West Virginia  -0.242009 -0.300211  0.25200  0.10725  0.320984 -0.8037 0.438
## Wisconsin      -0.019985  0.030712  0.02831 -0.02878 -0.153695  0.1902 1.195
## Wyoming        -0.058141 -0.215000  0.18064 -0.01640  0.176116 -0.4062 1.012
##                  cook.d    hat inf
## Alabama        9.32e-05 0.0954    
## Alaska         4.58e-02 0.1803    
## Arizona        7.79e-03 0.0493    
## Arkansas       9.37e-03 0.0538    
## California     2.57e-02 0.2821   *
## Colorado       9.73e-05 0.0301    
## Connecticut    5.57e-02 0.2255    
## Delaware       2.18e-03 0.0582    
## Florida        2.64e-02 0.1407    
## Georgia        8.73e-03 0.0942    
## Hawaii         3.01e-03 0.0516    
## Idaho          5.84e-03 0.0585    
## Illinois       3.73e-03 0.1348    
## Indiana        3.51e-03 0.0503    
## Iowa           2.95e-02 0.0588    
## Kansas         3.59e-03 0.1038    
## Kentucky       7.28e-03 0.0411    
## Louisiana      1.26e-04 0.0875    
## Maine          9.61e-03 0.1397    
## Maryland       1.51e-03 0.0537    
## Massachusetts  4.54e-02 0.0887    
## Michigan       2.85e-03 0.1654    
## Minnesota      2.37e-02 0.0607    
## Mississippi    1.15e-04 0.0625    
## Missouri       4.56e-04 0.0559    
## Montana        3.27e-03 0.1212    
## Nebraska       5.48e-04 0.0795    
## Nevada         2.73e-02 0.0454    
## New Hampshire  7.99e-02 0.0828    
## New Jersey     1.30e-02 0.2221   *
## New Mexico     1.36e-04 0.0453    
## New York       1.31e-03 0.1916   *
## North Carolina 9.11e-03 0.0906    
## North Dakota   7.95e-02 0.0810    
## Ohio           9.27e-03 0.0448    
## Oklahoma       2.46e-04 0.0595    
## Oregon         1.96e-02 0.0983    
## Pennsylvania   8.42e-04 0.1228    
## Rhode Island   2.94e-05 0.0674    
## South Carolina 4.50e-02 0.0967    
## South Dakota   1.37e-02 0.0966    
## Tennessee      1.15e-02 0.0916    
## Texas          1.31e-02 0.0541    
## Utah           4.72e-01 0.2921   *
## Vermont        3.99e-03 0.0857    
## Virginia       5.99e-03 0.1523    
## Washington     8.32e-03 0.1082    
## West Virginia  1.08e-01 0.0621   *
## Wisconsin      7.34e-03 0.0981    
## Wyoming        3.25e-02 0.0875
cooks.distance(sat_model_take)
##        Alabama         Alaska        Arizona       Arkansas     California 
##   9.320823e-05   4.581008e-02   7.787666e-03   9.366213e-03   2.571304e-02 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##   9.733862e-05   5.574806e-02   2.178814e-03   2.641399e-02   8.732755e-03 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##   3.014830e-03   5.837035e-03   3.727482e-03   3.508650e-03   2.949550e-02 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##   3.586302e-03   7.280787e-03   1.260173e-04   9.607378e-03   1.508793e-03 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##   4.543308e-02   2.846396e-03   2.372007e-02   1.146592e-04   4.562770e-04 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##   3.265407e-03   5.480389e-04   2.731373e-02   7.989442e-02   1.297264e-02 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##   1.357578e-04   1.305355e-03   9.111288e-03   7.954292e-02   9.271069e-03 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##   2.457687e-04   1.955161e-02   8.418630e-04   2.936388e-05   4.504204e-02 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##   1.367187e-02   1.148386e-02   1.306408e-02   4.715287e-01   3.991940e-03 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##   5.990470e-03   8.318437e-03   1.081395e-01   7.340931e-03   3.247841e-02
#nh utah wv
plot(sat_model_take,4)

plot(sat_model_take,5)