This question is from the textbook: Regression Analysis page 23, Exercises 1.2.
Give an example in any area of interest to you (other than those already presented in Chapter 1) where regression analysis can be used as a data analytic tool to answer some questions of interest.
(2 points)-The relationship between car distance and price.
(2 points)-response variables is dist,predictor variables is price.
(2 points)-dist and price are quantitative.
Regression Analysis page 18, Table 1.15) can be used to analyze the data? (2 points)-linear regression.
(2 points)-\[y=\beta_0+\beta_1x+\varepsilon\]
data(cars)
y <- cars$dist
x <- cars$price
lr <- (y~x)
summary(lr)
## Length Class Mode
## 3 formula call
\[y=0.2316x-8.1671\] ## Question 2
Every ggplot2 plot has three key components:
Data.
A set of aesthetic mappings between variables in the data and visual properties.
At least one layer which describes how to render each observation. Layers are usually created with a geom function.
This exercises use the data set mpg. For example:
library(ggplot2)
## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'tibble'
## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'pillar'
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_point()
1. Draw a scatter plot of cty and hwy using ggplot2. How would you describe the relationship between cty and hwy?
(2 points)
library(ggplot2)
ggplot(mpg, aes(x = cty, y = hwy)) +
geom_point()
-hwy increases as cty increases,they are positively correlated.
ggplot(mpg, aes(x = cty, y = hwy, colour = class)) +
geom_point()
ggplot(mpg, aes(x = cty, y = hwy, shape = drv)) +
geom_point()
ggplot(mpg, aes(x = cty, y = hwy, size = cyl)) +
geom_point()
Experiment with the colour, shape and size aesthetics. What happens when you map them to continuous values? What about categorical values? What happens when you use more than one aesthetic in a plot? (2 points)
ggplot(mpg, aes(x = cty, y = hwy, colour = class)) +
geom_point()
ggplot(mpg, aes(x = cty, y = hwy, shape = drv)) +
geom_point()
ggplot(mpg, aes(x = cty, y = hwy, size = cyl)) +
geom_point()
ggplot(mpg, aes(x = cty, y = hwy, colour = hwy)) +
geom_point()
# ggplot(mpg, aes(x = cty, y = hwy, shape = hwy)) +
# geom_point()
shape can not equal continuous values.
ggplot(mpg, aes(x = cty, y = hwy, size = hwy)) +
geom_point()
ggplot(mpg, aes(x = cty, y = hwy, colour = class)) +
geom_point()
ggplot(mpg, aes(x = cty, y = hwy, shape = class)) +
geom_point()
## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 7. Consider
## specifying shapes manually if you must have them.
## Warning: Removed 62 rows containing missing values (geom_point).
ggplot(mpg, aes(x = cty, y = hwy, size = class)) +
geom_point()
## Warning: Using size for a discrete variable is not advised.
Continuous variables than categorical variables that situation will get even more,describe the relationship between variables more specifically.
ggplot(mpg, aes(displ, hwy)) +
geom_point() +
facet_wrap(~class)
What happens if you try to facet by a continuous variable like hwy? What about cyl? What’s the key difference? (2 points)
ggplot(mpg, aes(displ, hwy)) +
geom_point() +
facet_wrap(~hwy)
ggplot(mpg, aes(displ, hwy)) +
geom_point() +
facet_wrap(~cty)
(displ, hwy) according to the values of the variables hwy and cty, they are divided into different categories.
ggplot(mpg, aes(displ, hwy)) +
geom_point() +
geom_smooth(span = 0.75)
An important argument to geom_smooth() is the method, which allows you to choose which type of model is used to fit the smooth curve.
?loess). Try different parameter values of span and see what happens. (2 points)ggplot(mpg, aes(displ, hwy)) +
geom_point() +
geom_smooth(span = 0.75,method = loess,formula = y ~ x)
ggplot(mpg, aes(displ, hwy)) +
geom_point() +
geom_smooth(span = 0.95,method = loess,formula = y ~ x)
ggplot(mpg, aes(displ, hwy)) +
geom_point() +
geom_smooth(method = "lm",formula = y ~ x)
ggplot(mpg, aes(displ, hwy)) +
geom_point() +
geom_smooth(method = "gam", formula = y ~ s(x))
ggplot(mpg, aes(displ, hwy)) +
geom_point() +
geom_smooth(method = "gam", formula = y ~ s(x,bs="cs"))
The data set walleye in the package alr4 of data measured on walleye fish in Wiscocin.
(2 points)library(alr4)
## Warning: 程辑包'alr4'是用R版本4.1.3 来建造的
## 载入需要的程辑包:car
## Warning: 程辑包'car'是用R版本4.1.3 来建造的
## 载入需要的程辑包:carData
## Warning: 程辑包'carData'是用R版本4.1.3 来建造的
## 载入需要的程辑包:effects
## Warning: 程辑包'effects'是用R版本4.1.3 来建造的
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(ggplot2)
library(dplyr)
##
## 载入程辑包:'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
ages <- filter(walleye, age %in% seq(5,8))
ggplot(data = ages,aes(x = age,y = length, group = age)) +
geom_boxplot()
(2 points)library(alr4)
library(ggplot2)
library(dplyr)
ages <- filter(walleye, age %in% seq(5,8))
age5 <- filter(ages,age == 5)
age6 <- filter(ages,age == 6)
age7 <- filter(ages,age == 7)
age8 <- filter(ages,age == 8)
m1 <- mean(age5$length)
m2 <- mean(age6$length)
m3 <- mean(age7$length)
m4 <- mean(age8$length)
s1 <- sd(age5$length)
s2 <- sd(age6$length)
s3 <- sd(age7$length)
s4 <- sd(age8$length)
Name <- c('age==5','age==6','age==7','age==8')
Mean <- c(m1,m2,m3,m4)
Sd <- c(s1,s2,s3,s4)
dat1 <- data.frame(Name,Mean,Sd)
dat1
## Name Mean Sd
## 1 age==5 364.0497 30.54235
## 2 age==6 384.3787 29.06897
## 3 age==7 402.8783 27.83498
## 4 age==8 420.0894 28.38588
(2 points)library(ggplot2)
library(magrittr)
library(dplyr)
library(patchwork)
## Warning: 程辑包'patchwork'是用R版本4.1.3 来建造的
p5 <- ggplot() +
geom_histogram(data=age5,aes(x=length),stat='bin',bins=20,fill='darkgreen',color='gray') +
ggtitle("age5")
p6 <- ggplot()+
geom_histogram(data=age6,aes(x=length),stat='bin',bins=20,fill='darkgreen',color='gray') +
ggtitle("age6")
p7 <- ggplot()+
geom_histogram(data=age7,aes(x=length),stat='bin',bins=20,fill='darkgreen',color='gray') +
ggtitle("age7")
p8 <- ggplot()+
geom_histogram(data=age8,aes(x=length),stat='bin',bins=20,fill='darkgreen',color='gray') +
ggtitle("age8")
(p5+p6)/(p7+p8)
(2 points)library(alr4)
library(dplyr)
ages <- filter(walleye, age %in% seq(5,8))
age5 <- filter(ages,age == 5)
age7 <- filter(ages,age == 7)
t5 <- t.test(age5$length)
t7 <- t.test(age7$length)
Name <- c('age==5','age==7')
Min <- c(t5$conf.int[1],t7$conf.int[1])
Max <- c(t5$conf.int[2],t7$conf.int[2])
dat2 <- data.frame(Name,Min,Max)
dat2
## Name Min Max
## 1 age==5 360.7473 367.3522
## 2 age==7 398.4025 407.3541
Suppose that the length of the sample is equal to the average length.
(2 points)library(alr4)
library(dplyr)
ages <- filter(walleye, age %in% seq(5,8))
age5 <- filter(ages,age == 5)
age7 <- filter(ages,age == 7)
confint<-function(x,sigma=-1,alpha=0.05)
{
n<-length(x)
xb<-mean(x)
if(sigma>=0)
{
tmp<-sigma/sqrt(n)*qnorm(1-alpha/2);df<-n
}
else{
tmp<-sd(x)/sqrt(n)*qt(1-alpha/2,n-1);df<- n-1
}
data.frame(mean=xb,df=df,a=xb-tmp,b=xb+tmp)
}
C5 <- confint(age5$length,10)
C7 <- confint(age7$length,10)
Name <- c('age==5','age==7')
Mean <- c(C5$mean,C7$mean)
Min <- c(C5$a,C7$a)
Max <- c(C5$b,C7$b)
dat3 <- data.frame(Name,Mean,Min,Max)
dat3
## Name Mean Min Max
## 1 age==5 364.0497 362.9725 365.1270
## 2 age==7 402.8783 401.2833 404.4733
Suppose that the variance is unknown, whether the length of the sample is equal to the average length.
The following questions are based on the anscombe data in \(\Re\).
(2 points)library(ggplot2)
library(magrittr)
library(dplyr)
library(gridExtra)
##
## 载入程辑包:'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
p1 <- ggplot(data = anscombe) +
geom_point(aes(x = x1, y = y1)) +
xlab(bquote(x[1])) +
ylab(bquote(y[1])) +
ggtitle(paste0("n=", dim(anscombe %>% select(x1, y1))[1])) +
theme(plot.title = element_text(hjust = 0.5))
p2 <- ggplot(data = anscombe) +
geom_point(aes(x = x2, y = y2)) +
xlab(bquote(x[2])) +
ylab(bquote(y[2])) +
ggtitle(paste0("n=", dim(anscombe %>% select(x2, y2))[1])) +
theme(plot.title = element_text(hjust = 0.5))
p3 <- ggplot(data = anscombe) +
geom_point(aes(x = x3, y = y3)) +
xlab(bquote(x[3])) +
ylab(bquote(y[3])) +
ggtitle(paste0("n=", dim(anscombe %>% select(x3, y3))[1])) +
theme(plot.title = element_text(hjust = 0.5))
p4 <- ggplot(data = anscombe) +
geom_point(aes(x = x4, y = y4)) +
xlab(bquote(x[4])) +
ylab(bquote(y[4])) +
ggtitle(paste0("n=", dim(anscombe %>% select(x4, y4))[1])) +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(p1,p2,p3,p4,ncol = 2,nrow = 2)
\(y_1 \sim x_1\)
\(y_2 \sim x_2\)
\(y_3 \sim x_3\)
\(y_4 \sim x_4\)
using the command lm. Verify that all the fitted models have the exact same coefficients (up to numerical tolerance). (2 points)
lm1 <- lm(y1~x1,data = anscombe)
lm2 <- lm(y2~x2,data = anscombe)
lm3 <- lm(y3~x3,data = anscombe)
lm4 <- lm(y4~x4,data = anscombe)
summary(lm1)
##
## Call:
## lm(formula = y1 ~ x1, data = anscombe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.92127 -0.45577 -0.04136 0.70941 1.83882
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0001 1.1247 2.667 0.02573 *
## x1 0.5001 0.1179 4.241 0.00217 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared: 0.6665, Adjusted R-squared: 0.6295
## F-statistic: 17.99 on 1 and 9 DF, p-value: 0.00217
summary(lm2)
##
## Call:
## lm(formula = y2 ~ x2, data = anscombe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9009 -0.7609 0.1291 0.9491 1.2691
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.001 1.125 2.667 0.02576 *
## x2 0.500 0.118 4.239 0.00218 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared: 0.6662, Adjusted R-squared: 0.6292
## F-statistic: 17.97 on 1 and 9 DF, p-value: 0.002179
summary(lm3)
##
## Call:
## lm(formula = y3 ~ x3, data = anscombe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1586 -0.6146 -0.2303 0.1540 3.2411
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0025 1.1245 2.670 0.02562 *
## x3 0.4997 0.1179 4.239 0.00218 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared: 0.6663, Adjusted R-squared: 0.6292
## F-statistic: 17.97 on 1 and 9 DF, p-value: 0.002176
summary(lm4)
##
## Call:
## lm(formula = y4 ~ x4, data = anscombe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.751 -0.831 0.000 0.809 1.839
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0017 1.1239 2.671 0.02559 *
## x4 0.4999 0.1178 4.243 0.00216 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared: 0.6667, Adjusted R-squared: 0.6297
## F-statistic: 18 on 1 and 9 DF, p-value: 0.002165
(2 points)c1 <- cor(anscombe$y1,anscombe$x1)
c2 <- cor(anscombe$y2,anscombe$x2)
c3 <- cor(anscombe$y3,anscombe$x3)
c4 <- cor(anscombe$y4,anscombe$x4)
Name <- c('lm1','lm2','lm3','lm4')
Cor <- c(c1,c2,c3,c4)
dat1 <- data.frame(Name,Cor)
dat1
## Name Cor
## 1 lm1 0.8164205
## 2 lm2 0.8162365
## 3 lm3 0.8162867
## 4 lm4 0.8165214
(2 points)lm5 <- lm(x1~y1,data = anscombe)
lm6 <- lm(x2~y2,data = anscombe)
lm7 <- lm(x3~y3,data = anscombe)
lm8 <- lm(x4~y4,data = anscombe)
summary(lm5)
##
## Call:
## lm(formula = x1 ~ y1, data = anscombe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6522 -1.5117 -0.2657 1.2341 3.8946
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.9975 2.4344 -0.410 0.69156
## y1 1.3328 0.3142 4.241 0.00217 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.019 on 9 degrees of freedom
## Multiple R-squared: 0.6665, Adjusted R-squared: 0.6295
## F-statistic: 17.99 on 1 and 9 DF, p-value: 0.00217
summary(lm6)
##
## Call:
## lm(formula = x2 ~ y2, data = anscombe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8516 -1.4315 -0.3440 0.8467 4.2017
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.9948 2.4354 -0.408 0.69246
## y2 1.3325 0.3144 4.239 0.00218 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.02 on 9 degrees of freedom
## Multiple R-squared: 0.6662, Adjusted R-squared: 0.6292
## F-statistic: 17.97 on 1 and 9 DF, p-value: 0.002179
summary(lm7)
##
## Call:
## lm(formula = x3 ~ y3, data = anscombe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9869 -1.3733 -0.0266 1.3200 3.2133
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.0003 2.4362 -0.411 0.69097
## y3 1.3334 0.3145 4.239 0.00218 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.019 on 9 degrees of freedom
## Multiple R-squared: 0.6663, Adjusted R-squared: 0.6292
## F-statistic: 17.97 on 1 and 9 DF, p-value: 0.002176
summary(lm8)
##
## Call:
## lm(formula = x4 ~ y4, data = anscombe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7859 -1.4122 -0.1853 1.4551 3.3329
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.0036 2.4349 -0.412 0.68985
## y4 1.3337 0.3143 4.243 0.00216 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.018 on 9 degrees of freedom
## Multiple R-squared: 0.6667, Adjusted R-squared: 0.6297
## F-statistic: 18 on 1 and 9 DF, p-value: 0.002165
Results are different.
(2 points)\[SSE=\sum_{i=1}^n(y_i-\hat{y_i})^2\] \[SST=\sum_{i=1}^n(y_i-\bar{y_i})^2\] \[ R^2=1-\frac{SSE}{SST} \]
lm1 <- lm(y1~x1,data = anscombe)
lm2 <- lm(y2~x2,data = anscombe)
lm3 <- lm(y3~x3,data = anscombe)
lm4 <- lm(y4~x4,data = anscombe)
SSE1 <- sum(resid(lm1)^2)
SST1 <- sum((anscombe$y1-mean(anscombe$y1))^2)
R1 <- 1-SSE1/SST1
SSE2 <- sum(resid(lm2)^2)
SST2 <- sum((anscombe$y2-mean(anscombe$y2))^2)
R2 <- 1-SSE2/SST2
SSE3 <- sum(resid(lm3)^2)
SST3 <- sum((anscombe$y3-mean(anscombe$y3))^2)
R3 <- 1-SSE3/SST3
SSE4 <- sum(resid(lm4)^2)
SST4 <- sum((anscombe$y4-mean(anscombe$y4))^2)
R4 <- 1-SSE4/SST4
Name <- c('lm1','lm2','lm3','lm4')
SSE <- c(SSE1,SSE2,SSE3,SSE4)
SST <- c(SST1,SST2,SST3,SST4)
RF <- c(R1,R2,R3,R4)
dat2 <- data.frame(Name,SSE,SST,RF)
dat2
## Name SSE SST RF
## 1 lm1 13.76269 41.27269 0.6665425
## 2 lm2 13.77629 41.27629 0.6662420
## 3 lm3 13.75619 41.22620 0.6663240
## 4 lm4 13.74249 41.23249 0.6667073
# Another way to calculate the residuals
# resid(lm1) <- predict(lm1, anscombe["x1"])-anscombe$y1
# SSE1 <- sum((predict(lm1, anscombe["x1"])-anscombe$y1)^2)
(2 points)library(ggplot2)
library(magrittr)
library(dplyr)
library(gridExtra)
lm1 <- lm(y1~x1,data = anscombe)
lm2 <- lm(y2~x2,data = anscombe)
lm3 <- lm(y3~x3,data = anscombe)
lm4 <- lm(y4~x4,data = anscombe)
p1 <- ggplot(data = anscombe) +
geom_point(aes(x = x1, y = y1)) +
xlab(bquote(x[1])) +
ylab(bquote(y[1])) +
ggtitle(paste0("n=", dim(anscombe %>% select(x1, y1))[1])) +
geom_abline(slope = 0.5001, intercept = 3.0001, color='red', size=1) +
theme(plot.title = element_text(hjust = 0.5))
p2 <- ggplot(data = anscombe) +
geom_point(aes(x = x2, y = y2)) +
xlab(bquote(x[2])) +
ylab(bquote(y[2])) +
ggtitle(paste0("n=", dim(anscombe %>% select(x2, y2))[1])) +
geom_abline(slope = 0.500, intercept = 3.001, color='blue', size=1) +
theme(plot.title = element_text(hjust = 0.5))
p3 <- ggplot(data = anscombe) +
geom_point(aes(x = x3, y = y3)) +
xlab(bquote(x[3])) +
ylab(bquote(y[3])) +
ggtitle(paste0("n=", dim(anscombe %>% select(x3, y3))[1])) +
geom_abline(slope = 0.4997, intercept = 3.0025, color='green', size=1) +
theme(plot.title = element_text(hjust = 0.5))
p4 <- ggplot(data = anscombe) +
geom_point(aes(x = x4, y = y4)) +
xlab(bquote(x[4])) +
ylab(bquote(y[4])) +
ggtitle(paste0("n=", dim(anscombe %>% select(x4, y4))[1])) +
geom_abline(slope = 0.4999, intercept = 3.0017, color='pink', size=1) +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(p1,p2,p3,p4,ncol = 2,nrow = 2)