grocery <- read.csv("/Users/caelynsobie/Downloads/grocery-retailer.csv")
library(ggplot2)
#a
pairs(grocery)
#it appears there might be a linear relationship between y and x3. other
than that there does not appear to be a linear relationshp between any
other varaibles.
#a
cor(grocery)
## y x1 x2 x3
## y 1.0000000 0.20766494 0.06002960 0.81057940
## x1 0.2076649 1.00000000 0.08489639 0.04565698
## x2 0.0600296 0.08489639 1.00000000 0.11337076
## x3 0.8105794 0.04565698 0.11337076 1.00000000
#This agrees with our previous finding of only a linear relationship between y and x3 with a correlation of 0.811
#b
model <- lm(formula = y~x1 + x2 + x3, grocery)
summary(model)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = grocery)
##
## Residuals:
## Min 1Q Median 3Q Max
## -264.05 -110.73 -22.52 79.29 295.75
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.150e+03 1.956e+02 21.220 < 2e-16 ***
## x1 7.871e-04 3.646e-04 2.159 0.0359 *
## x2 -1.317e+01 2.309e+01 -0.570 0.5712
## x3 6.236e+02 6.264e+01 9.954 2.94e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 143.3 on 48 degrees of freedom
## Multiple R-squared: 0.6883, Adjusted R-squared: 0.6689
## F-statistic: 35.34 on 3 and 48 DF, p-value: 3.316e-12
#As every case shipped increases by one, the estimated mean total labor hours increase by 7.871e-04. Under alpha = 0.05 this is significant since the p-value is 0.0359.
#As the indirect cost of the total labor hours as a percentage increases by one, the estimated mean total labor hours decreases by 1.317e+01. Under alpha = 0.05 this is insignificant since the p-value is 0.5712.
#When the week is a holiday, the estimated mean total labor hours decreases by 6.236e+02.Under alpha = 0.05 this is significant since the p-value is 2.94e-13.
#c residuals against Yhat, X1,X2,X3,X1X2 and QQ plot
res <- model$residuals
plot(grocery$x1,res)
plot(grocery$x2,res)
plot(grocery$x3,res)
plot(model$fitted.values, res)
x1x2 <- grocery$x1 * grocery$x2
plot(x1x2, res)
qqnorm(res)
qqline(res)
#mostly normal qq plot right scewed
#d
model_red1 <- lm(formula = y~x1 + x2, grocery)
summary(model_red1)
##
## Call:
## lm(formula = y ~ x1 + x2, data = grocery)
##
## Residuals:
## Min 1Q Median 3Q Max
## -376.21 -173.77 -49.36 123.73 601.11
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.995e+03 3.378e+02 11.829 5.72e-16 ***
## x1 9.192e-04 6.312e-04 1.456 0.152
## x2 1.212e+01 3.977e+01 0.305 0.762
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 248.3 on 49 degrees of freedom
## Multiple R-squared: 0.04494, Adjusted R-squared: 0.005953
## F-statistic: 1.153 on 2 and 49 DF, p-value: 0.3242
#e
library(car)
## Loading required package: carData
avPlots(model_red1)
#f
model_y_x1 <- lm(y~grocery$x1, data = grocery)
res_y_x1 <- model_y_x1$residuals
summary(model_y_x1)
##
## Call:
## lm(formula = y ~ grocery$x1, data = grocery)
##
## Residuals:
## Min 1Q Median 3Q Max
## -356.18 -164.64 -56.07 111.23 619.01
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.080e+03 1.917e+02 21.283 <2e-16 ***
## grocery$x1 9.355e-04 6.232e-04 1.501 0.14
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 246 on 50 degrees of freedom
## Multiple R-squared: 0.04312, Adjusted R-squared: 0.02399
## F-statistic: 2.253 on 1 and 50 DF, p-value: 0.1396
#f
model_x2_x1 <- lm(x2~grocery$x1, data = grocery)
res_x2_x1 <- model_x2_x1$residuals
summary(model_x2_x1)
##
## Call:
## lm(formula = x2 ~ grocery$x1, data = grocery)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.78008 -0.52558 -0.02846 0.56248 2.18874
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.963e+00 6.880e-01 10.120 1.07e-13 ***
## grocery$x1 1.348e-06 2.237e-06 0.602 0.55
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8829 on 50 degrees of freedom
## Multiple R-squared: 0.007207, Adjusted R-squared: -0.01265
## F-statistic: 0.363 on 1 and 50 DF, p-value: 0.5496
model_res <- lm(res_y_x1 ~ res_x2_x1)
summary(model_res)
##
## Call:
## lm(formula = res_y_x1 ~ res_x2_x1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -376.21 -173.77 -49.36 123.73 601.11
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.525e-15 3.408e+01 0.000 1.000
## res_x2_x1 1.212e+01 3.937e+01 0.308 0.759
##
## Residual standard error: 245.8 on 50 degrees of freedom
## Multiple R-squared: 0.001892, Adjusted R-squared: -0.01807
## F-statistic: 0.0948 on 1 and 50 DF, p-value: 0.7594
Problem 4
library(datasets)
data("stackloss")
#a
stack_model <- lm(stack.loss ~ Air.Flow + Water.Temp + Acid.Conc., data = as.data.frame(stack.x))
summary(stack_model)
##
## Call:
## lm(formula = stack.loss ~ Air.Flow + Water.Temp + Acid.Conc.,
## data = as.data.frame(stack.x))
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.2377 -1.7117 -0.4551 2.3614 5.6978
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -39.9197 11.8960 -3.356 0.00375 **
## Air.Flow 0.7156 0.1349 5.307 5.8e-05 ***
## Water.Temp 1.2953 0.3680 3.520 0.00263 **
## Acid.Conc. -0.1521 0.1563 -0.973 0.34405
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.243 on 17 degrees of freedom
## Multiple R-squared: 0.9136, Adjusted R-squared: 0.8983
## F-statistic: 59.9 on 3 and 17 DF, p-value: 3.016e-09
#b
stack_model_air <- lm(stack.loss ~ Air.Flow, data = as.data.frame(stack.x))
summary(stack_model_air)
##
## Call:
## lm(formula = stack.loss ~ Air.Flow, data = as.data.frame(stack.x))
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.2896 -1.1272 -0.0459 1.1166 8.8728
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -44.13202 6.10586 -7.228 7.31e-07 ***
## Air.Flow 1.02031 0.09995 10.208 3.77e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.098 on 19 degrees of freedom
## Multiple R-squared: 0.8458, Adjusted R-squared: 0.8377
## F-statistic: 104.2 on 1 and 19 DF, p-value: 3.774e-09
#c
res_full <- stack.loss - predict(stack_model)
sse_full <-sum(res_full^2)
res_red <- stack.loss - predict(stack_model_air)
sse_red <- sum(res_red^2)
cat("SSE (Full model):", sse_full, "\n")
## SSE (Full model): 178.83
cat("SSE (Reduced model):", sse_red, "\n")
## SSE (Reduced model): 319.1161
n <- nrow(stack.x)
# Degrees of freedom for the full model
p_full <- length(coef(stack_model)) # Number of parameters in full model
df_full <- n - p_full
# Degrees of freedom for the reduced model
df_reduced <- n - 2
cat("Degrees of Freedom (Full model):", df_full, "\n")
## Degrees of Freedom (Full model): 17
cat("Degrees of Freedom (Reduced model):", df_reduced, "\n")
## Degrees of Freedom (Reduced model): 19
#d
f_stat_num <- (sse_red - sse_full)/(df_reduced - df_full)
f_stat_dem <- sse_full / df_full
f_stat <- f_stat_num / f_stat_dem
f_stat
## [1] 6.667967
crit_f <- qf(1-0.05, df_reduced - df_full, df_full)
crit_f
## [1] 3.591531
#e
pf(f_stat, df_reduced - df_full, df_full, lower.tail = FALSE)
## [1] 0.007280786
Problem 5
library(faraway)
##
## Attaching package: 'faraway'
## The following objects are masked from 'package:car':
##
## logit, vif
data("sat")
#a
sat_model <- lm(total ~ expend + salary + ratio, data = sat)
summary(sat_model)
##
## Call:
## lm(formula = total ~ expend + salary + ratio, data = sat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -140.911 -46.740 -7.535 47.966 123.329
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1069.234 110.925 9.639 1.29e-12 ***
## expend 16.469 22.050 0.747 0.4589
## salary -8.823 4.697 -1.878 0.0667 .
## ratio 6.330 6.542 0.968 0.3383
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 68.65 on 46 degrees of freedom
## Multiple R-squared: 0.2096, Adjusted R-squared: 0.1581
## F-statistic: 4.066 on 3 and 46 DF, p-value: 0.01209
#b) As the expenditure per pupil increases by one, the estimated mean SAT score increases by 16.469. This appears to be insigificant since with alpha = 0.05, the p-value is 0.4589.
#As the estimated average salary of teachers increases by one, the estimated mean SAT score decreases by 8.823. This appears to be insigificant since with alpha = 0.05, the p-value is 0.0667.
#b) As the average pupil/teacher ratio increases by one, the estimated mean SAT score increases by 6.330. This appears to be insigificant since with alpha = 0.05, the p-value is 0.3383.
#c
sat_model_take <- lm(total ~ expend + salary + ratio + takers, data = sat)
summary(sat_model_take)
##
## Call:
## lm(formula = total ~ expend + salary + ratio + takers, data = sat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -90.531 -20.855 -1.746 15.979 66.571
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1045.9715 52.8698 19.784 < 2e-16 ***
## expend 4.4626 10.5465 0.423 0.674
## salary 1.6379 2.3872 0.686 0.496
## ratio -3.6242 3.2154 -1.127 0.266
## takers -2.9045 0.2313 -12.559 2.61e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.7 on 45 degrees of freedom
## Multiple R-squared: 0.8246, Adjusted R-squared: 0.809
## F-statistic: 52.88 on 4 and 45 DF, p-value: < 2.2e-16
#d
esr <- rstudent(sat_model_take)
plot(esr)
alpha = 0.1
num = nrow(sat)
bon_test <- qt(1-(alpha/(2*num)),num-5)
bon_test
## [1] 3.28148
#e
influence.measures(sat_model_take)
## Influence measures of
## lm(formula = total ~ expend + salary + ratio + takers, data = sat) :
##
## dfb.1_ dfb.expn dfb.slry dfb.rati dfb.tkrs dffit cov.r
## Alabama -0.010968 0.016083 -0.01465 0.01246 0.009287 -0.0213 1.236
## Alaska 0.329192 -0.203908 0.01885 -0.15367 0.162010 -0.4788 1.214
## Arizona 0.039954 0.012636 0.01592 -0.08386 -0.031505 -0.1968 1.082
## Arkansas -0.102502 0.072421 -0.04365 0.05947 0.086620 -0.2160 1.078
## California 0.174948 0.118793 -0.12779 -0.11353 -0.037696 -0.3558 1.503
## Colorado -0.004665 -0.002878 0.00275 0.00542 -0.001721 0.0218 1.152
## Connecticut 0.043301 -0.241875 0.35844 -0.29431 -0.042669 0.5277 1.297
## Delaware -0.026896 0.036038 -0.03465 0.03347 0.064331 0.1034 1.165
## Florida 0.171083 -0.251733 0.28991 -0.30547 -0.189656 -0.3626 1.190
## Georgia -0.089786 0.056707 0.00746 0.02730 -0.162854 -0.2076 1.179
## Hawaii 0.008805 0.052365 -0.04880 0.00690 -0.048352 -0.1218 1.144
## Idaho -0.010452 0.043337 -0.01201 -0.03068 0.006906 -0.1698 1.128
## Illinois 0.010840 -0.077886 0.10749 -0.06730 -0.094617 0.1352 1.276
## Indiana -0.012996 0.054590 -0.03757 0.01147 -0.072887 -0.1315 1.135
## Iowa 0.146491 -0.022347 0.04979 -0.13371 -0.269136 0.3901 0.908
## Kansas 0.064367 -0.066918 0.08342 -0.09488 -0.093129 0.1326 1.227
## Kentucky -0.055584 0.053671 -0.05914 0.05460 0.115230 -0.1905 1.061
## Louisiana -0.002552 -0.013671 0.01744 -0.00966 0.000612 -0.0248 1.225
## Maine 0.065238 0.089610 -0.13058 0.01925 0.142860 0.2174 1.258
## Maryland -0.036694 0.028005 -0.01726 0.02913 0.036164 0.0860 1.165
## Massachusetts 0.121425 -0.115373 0.11054 -0.17372 0.252030 0.4840 0.941
## Michigan 0.083614 -0.030896 -0.00787 -0.04800 0.069961 -0.1181 1.330
## Minnesota -0.053012 -0.018528 0.10317 -0.02670 -0.276240 0.3477 0.967
## Mississippi 0.009327 -0.003042 -0.00194 -0.00071 -0.005181 0.0237 1.192
## Missouri 0.025220 -0.008855 0.00962 -0.02253 -0.028187 0.0473 1.180
## Montana -0.013822 0.101987 -0.11053 0.06839 0.012165 0.1265 1.257
## Nebraska 0.019231 0.014320 -0.01128 -0.01177 -0.026133 0.0518 1.211
## Nevada 0.011256 0.193287 -0.17732 0.01371 0.025487 -0.3777 0.843
## New Hampshire 0.288470 -0.177760 0.02122 -0.15660 0.495646 0.6582 0.727
## New Jersey 0.088106 -0.161538 0.09431 -0.05161 0.001040 -0.2525 1.402
## New Mexico -0.008748 -0.000345 0.00506 -0.00103 0.004880 -0.0258 1.170
## New York 0.039442 -0.046978 0.02375 -0.02266 -0.003108 -0.0799 1.380
## North Carolina -0.089982 0.023094 0.04725 0.00543 -0.167509 -0.2121 1.169
## North Dakota 0.329866 0.147137 -0.24289 -0.06672 -0.157294 0.6574 0.718
## Ohio -0.027760 0.075672 -0.11966 0.09067 0.137909 -0.2153 1.048
## Oklahoma -0.022026 0.002617 0.00214 0.01178 0.011361 -0.0347 1.187
## Oregon -0.235979 0.157568 -0.13770 0.25652 0.105385 0.3123 1.122
## Pennsylvania 0.000111 0.038757 -0.04620 0.02513 -0.007381 -0.0642 1.271
## Rhode Island -0.002191 0.001001 -0.00193 0.00401 -0.004106 -0.0120 1.200
## South Carolina -0.229763 0.117011 0.04964 0.04909 -0.360615 -0.4806 0.975
## South Dakota 0.173443 0.016503 -0.05486 -0.08362 -0.068242 0.2604 1.153
## Tennessee 0.063024 -0.174079 0.16131 -0.08677 -0.083963 0.2385 1.156
## Texas -0.163161 0.065455 0.01142 0.07415 -0.139793 -0.2560 1.040
## Utah -0.958297 0.542396 -0.64657 1.32142 0.120463 1.6250 0.802
## Vermont 0.061879 0.014801 -0.03245 -0.03386 0.080162 0.1400 1.195
## Virginia 0.123009 -0.121882 0.08548 -0.11635 0.070409 0.1715 1.296
## Washington -0.136046 0.102315 -0.10643 0.17308 0.084684 0.2024 1.208
## West Virginia -0.242009 -0.300211 0.25200 0.10725 0.320984 -0.8037 0.438
## Wisconsin -0.019985 0.030712 0.02831 -0.02878 -0.153695 0.1902 1.195
## Wyoming -0.058141 -0.215000 0.18064 -0.01640 0.176116 -0.4062 1.012
## cook.d hat inf
## Alabama 9.32e-05 0.0954
## Alaska 4.58e-02 0.1803
## Arizona 7.79e-03 0.0493
## Arkansas 9.37e-03 0.0538
## California 2.57e-02 0.2821 *
## Colorado 9.73e-05 0.0301
## Connecticut 5.57e-02 0.2255
## Delaware 2.18e-03 0.0582
## Florida 2.64e-02 0.1407
## Georgia 8.73e-03 0.0942
## Hawaii 3.01e-03 0.0516
## Idaho 5.84e-03 0.0585
## Illinois 3.73e-03 0.1348
## Indiana 3.51e-03 0.0503
## Iowa 2.95e-02 0.0588
## Kansas 3.59e-03 0.1038
## Kentucky 7.28e-03 0.0411
## Louisiana 1.26e-04 0.0875
## Maine 9.61e-03 0.1397
## Maryland 1.51e-03 0.0537
## Massachusetts 4.54e-02 0.0887
## Michigan 2.85e-03 0.1654
## Minnesota 2.37e-02 0.0607
## Mississippi 1.15e-04 0.0625
## Missouri 4.56e-04 0.0559
## Montana 3.27e-03 0.1212
## Nebraska 5.48e-04 0.0795
## Nevada 2.73e-02 0.0454
## New Hampshire 7.99e-02 0.0828
## New Jersey 1.30e-02 0.2221 *
## New Mexico 1.36e-04 0.0453
## New York 1.31e-03 0.1916 *
## North Carolina 9.11e-03 0.0906
## North Dakota 7.95e-02 0.0810
## Ohio 9.27e-03 0.0448
## Oklahoma 2.46e-04 0.0595
## Oregon 1.96e-02 0.0983
## Pennsylvania 8.42e-04 0.1228
## Rhode Island 2.94e-05 0.0674
## South Carolina 4.50e-02 0.0967
## South Dakota 1.37e-02 0.0966
## Tennessee 1.15e-02 0.0916
## Texas 1.31e-02 0.0541
## Utah 4.72e-01 0.2921 *
## Vermont 3.99e-03 0.0857
## Virginia 5.99e-03 0.1523
## Washington 8.32e-03 0.1082
## West Virginia 1.08e-01 0.0621 *
## Wisconsin 7.34e-03 0.0981
## Wyoming 3.25e-02 0.0875
cooks.distance(sat_model_take)
## Alabama Alaska Arizona Arkansas California
## 9.320823e-05 4.581008e-02 7.787666e-03 9.366213e-03 2.571304e-02
## Colorado Connecticut Delaware Florida Georgia
## 9.733862e-05 5.574806e-02 2.178814e-03 2.641399e-02 8.732755e-03
## Hawaii Idaho Illinois Indiana Iowa
## 3.014830e-03 5.837035e-03 3.727482e-03 3.508650e-03 2.949550e-02
## Kansas Kentucky Louisiana Maine Maryland
## 3.586302e-03 7.280787e-03 1.260173e-04 9.607378e-03 1.508793e-03
## Massachusetts Michigan Minnesota Mississippi Missouri
## 4.543308e-02 2.846396e-03 2.372007e-02 1.146592e-04 4.562770e-04
## Montana Nebraska Nevada New Hampshire New Jersey
## 3.265407e-03 5.480389e-04 2.731373e-02 7.989442e-02 1.297264e-02
## New Mexico New York North Carolina North Dakota Ohio
## 1.357578e-04 1.305355e-03 9.111288e-03 7.954292e-02 9.271069e-03
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 2.457687e-04 1.955161e-02 8.418630e-04 2.936388e-05 4.504204e-02
## South Dakota Tennessee Texas Utah Vermont
## 1.367187e-02 1.148386e-02 1.306408e-02 4.715287e-01 3.991940e-03
## Virginia Washington West Virginia Wisconsin Wyoming
## 5.990470e-03 8.318437e-03 1.081395e-01 7.340931e-03 3.247841e-02
#nh utah wv
plot(sat_model_take,4)
plot(sat_model_take,5)