1a

#plot with fitted regression line
library(alr4)
## Loading required package: car
## Loading required package: carData
## Loading required package: effects
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
attach(ftcollinstemp)
plot(winter~fall, xlab= 'Fall Temp', ylab= 'Winter Temp')
temp.lm = lm(winter~fall)
abline(temp.lm, col='blue')

1b

#testing null hypothesis
summary(temp.lm)
## 
## Call:
## lm(formula = winter ~ fall)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.8186 -1.7837 -0.0873  2.1300  7.5896 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  13.7843     7.5549   1.825   0.0708 .
## fall          0.3132     0.1528   2.049   0.0428 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.179 on 109 degrees of freedom
## Multiple R-squared:  0.0371, Adjusted R-squared:  0.02826 
## F-statistic:   4.2 on 1 and 109 DF,  p-value: 0.04284

Since the p-value 0.0428 is greater than α = 0.01, we fail to reject the null hypothesis that β^1=0, thus there is not significant evidence to conclude that there is a linear association between average winter temperatures and average fall temperatures.

1c

#percentage of variability
summary(temp.lm)
## 
## Call:
## lm(formula = winter ~ fall)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.8186 -1.7837 -0.0873  2.1300  7.5896 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  13.7843     7.5549   1.825   0.0708 .
## fall          0.3132     0.1528   2.049   0.0428 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.179 on 109 degrees of freedom
## Multiple R-squared:  0.0371, Adjusted R-squared:  0.02826 
## F-statistic:   4.2 on 1 and 109 DF,  p-value: 0.04284

This value is the coefficient of determination, which is R^2 = 0.0372 from the above summary. 3.7% of the variability in average winter temperatures is accounted for by a linear relationship with average fall temperatures.

detach(ftcollinstemp)

2a

#ANOVA
library(faraway)
## 
## Attaching package: 'faraway'
## The following objects are masked from 'package:alr4':
## 
##     cathedral, pipeline, twins
## The following objects are masked from 'package:car':
## 
##     logit, vif
attach(prostate) 
prostate.lm <-lm(lpsa~lcavol)
prostate.aov <- anova(prostate.lm)
prostate.aov
## Analysis of Variance Table
## 
## Response: lpsa
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## lcavol     1 69.003  69.003  111.27 < 2.2e-16 ***
## Residuals 95 58.915   0.620                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#hypothesis test
summary(prostate.lm, alpha=0.01)
## 
## Call:
## lm(formula = lpsa ~ lcavol)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.67625 -0.41648  0.09859  0.50709  1.89673 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.50730    0.12194   12.36   <2e-16 ***
## lcavol       0.71932    0.06819   10.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7875 on 95 degrees of freedom
## Multiple R-squared:  0.5394, Adjusted R-squared:  0.5346 
## F-statistic: 111.3 on 1 and 95 DF,  p-value: < 2.2e-16

Because the p-value is very small and less than α = 0.01, we reject the null hypothesis that the slope, β^1 is 0 conclude that there is a linear association between lpsa and lcavol.

2b

#coefficient of determination
anova(prostate.lm)
## Analysis of Variance Table
## 
## Response: lpsa
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## lcavol     1 69.003  69.003  111.27 < 2.2e-16 ***
## Residuals 95 58.915   0.620                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#R2=SSModel/SSTotal
R2= prostate.aov[[2]][[1]]/(prostate.aov[[2]][[1]]+prostate.aov[[2]][[2]])
R2
## [1] 0.5394319

The coefficient of determination, R^2, is 0.5394. Hence, 54% of the variability in lpsa is explained by the model, a linear relationship with lcavol.

2c

##In the ANOVA table from part (a) or using part (b), which quantity represents the variability in lpsa which is left unexplained by the regression?
prostate.aov <- anova(prostate.lm)
prostate.aov
## Analysis of Variance Table
## 
## Response: lpsa
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## lcavol     1 69.003  69.003  111.27 < 2.2e-16 ***
## Residuals 95 58.915   0.620                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The quantity that represents the variability in lcavol which is left unexplained by regression is the sum of squared residuals, which is 58.915 in this case.

detach(prostate)

3a

##Use the lm function in R to fit the regression of the response on the predictor. Draw a scatterplot of the data and add your fitted regression line.
shortleaf <- read.table("shortleaf.txt", header=TRUE)
fit <- lm(shortleaf$Vol~shortleaf$Diam)
shortleaf.lm <- lm(Vol~Diam, data=shortleaf)
shortleaf.lm
## 
## Call:
## lm(formula = Vol ~ Diam, data = shortleaf)
## 
## Coefficients:
## (Intercept)         Diam  
##     -41.568        6.837
plot(data=shortleaf, Vol~Diam, xlab= 'Diameter of Shortleaf Pines', ylab= 'Volume of Shortleaf Pines')
abline(shortleaf.lm, col="blue")

3b

#residuals vs fits
path <- file.path("~","Desktop","CLASSES","PSTAT126","shortleaf.txt")
dat = read.table(path, header = T)
head(dat)
##   Diam Vol
## 1  4.4 2.0
## 2  4.6 2.2
## 3  5.0 3.0
## 4  5.1 4.3
## 5  5.1 3.0
## 6  5.2 2.9
names(dat)
## [1] "Diam" "Vol"
#x: the predictor
#y: the response
x = dat$Diam
y = dat$Vol

fit = lm(y ~ x)
yhat = fitted(fit)
e = y - yhat

plot(x, y, xlab = 'Diameter', ylab = 'Volume', main = 'Tree Volume vs Diameter', ylim = c(min(yhat), max(y)), cex.lab = 1.3, cex.main = 1.5)
lines(x, fitted(fit), col = 2)