Question 2
# 1
library(rvest)
## Warning: package 'rvest' was built under R version 4.2.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
url = "http://cdec.water.ca.gov/cgi-progs/precip1/8STATIONHIST"
webpage = readLines(url)[476:593]
ind_of_rows_with_space = which(webpage == webpage[2])
webpage = webpage[-ind_of_rows_with_space]
# remove white space in each line
webpage = lapply(as.list(webpage), function(x){trimws(x)}) %>% unlist()
library(stringr)
temp = lapply(as.list(webpage), function(x){strsplit(x, " ", fixed = TRUE)}) %>% unlist()
temp = temp[-which(temp == temp[2])]
df = matrix(temp, ncol = 14, byrow = TRUE)
## Warning in matrix(temp, ncol = 14, byrow = TRUE): data length [310] is not a
## sub-multiple or multiple of the number of rows [23]
## Warning in matrix(temp, ncol = 14, byrow = TRUE): data length [310] is not a
## sub-multiple or multiple of the number of rows [23]
col.names.df = df[1, ]
df = df[-1, ] %>% data.frame()
colnames(df) = col.names.df
df = apply(df, 2, function(x){x = as.numeric(x)}) %>% data.frame()
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
write.csv(df, file = "rainfall.csv", row.names = FALSE)
url_groundhog = 'http://people.math.binghamton.edu/qiao/data501/data/groundhog.table'
url_rainfall = 'http://people.math.binghamton.edu/qiao/data501/data/rainfall.csv'
groundhog <- read.csv(url(url_groundhog))
rainfall <- read.csv(url(url_rainfall))
names(rainfall)[1] <- "year" # change the name of the first column so that left_join below
# knows how to merge the two tables.
merged <- left_join(groundhog,rainfall)
## Joining with `by = join_by(year)`
merged$mean_rain = merged$Total/12
ggplot(data = merged, aes(y=mean_rain, x =shadow, color=shadow)) + geom_boxplot()
Question 2 part 2
shadow <- merged[merged$shadow == "Y",]
no_shadow <- merged[merged$shadow == "N",]
t.test(shadow$mean_rain, no_shadow$mean_rain, var.equal = TRUE)
##
## Two Sample t-test
##
## data: shadow$mean_rain and no_shadow$mean_rain
## t = -0.54731, df = 19, p-value = 0.5905
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.946150 1.139317
## sample estimates:
## mean of x mean of y
## 4.297917 4.701333
Question 2 part 4
feb_shadow <- shadow %>% select(Feb, mean_rain)
feb_no_shadow <- no_shadow %>% select(Feb, mean_rain)
t.test(feb_shadow$mean_rain, feb_no_shadow$mean_rain, var.equal = TRUE)
##
## Two Sample t-test
##
## data: feb_shadow$mean_rain and feb_no_shadow$mean_rain
## t = -0.54731, df = 19, p-value = 0.5905
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.946150 1.139317
## sample estimates:
## mean of x mean of y
## 4.297917 4.701333
# Fail to reject null at .05 alpha level due to p-value being 0.5905 and 0 being in the confidence interval
Question 3 Part 1
library(alr4)
## Warning: package 'alr4' was built under R version 4.2.3
## Loading required package: car
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.2.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## Loading required package: effects
## Warning: package 'effects' was built under R version 4.2.3
## Warning in check_dep_version(): ABI version mismatch:
## lme4 was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install lme4 from source or restore original 'Matrix' package
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(magrittr)
## Warning: package 'magrittr' was built under R version 4.2.3
head(walleye)
walleye_sub = walleye %>% filter(age >= 5) %>% filter(age <= 8)
walleye_sub$age = factor(walleye_sub$age)
ggplot(walleye_sub, aes(y = length, x = age, color = age)) + geom_boxplot(fill = "orange")
Question 3 part 2
walleye_sub %>% group_by(age) %>% summarise(average = mean(length), standard_dev = sd(length))
Question 3 part 3
ggplot(walleye_sub, aes(x = length)) + geom_histogram() + facet_wrap(~ age)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Question 3 Part 4
age5 = walleye %>% filter(age == 5) %>% .$length
age7 = walleye %>% filter(age == 7) %>% .$length
t.test(age5, age7)$conf.int
## [1] -44.37574 -33.28132
## attr(,"conf.level")
## [1] 0.95
Question 3 Part 5
t.test(age5, age7, conf.level = .9)
##
## Welch Two Sample t-test
##
## data: age5 and age7
## t = -13.772, df = 316.61, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 90 percent confidence interval:
## -43.47971 -34.17735
## sample estimates:
## mean of x mean of y
## 364.0497 402.8783
Question 3 Part 6
age57 = walleye %>% filter(age == c(5,7)) %>% .[,c("age","length")]
age57$age = factor(age57$age)
summary(lm(data = age57, length ~ age))
##
## Call:
## lm(formula = length ~ age, data = age57)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.776 -18.275 -2.989 17.538 78.658
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 364.762 2.203 165.597 <2e-16 ***
## age7 36.259 3.931 9.225 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.38 on 240 degrees of freedom
## Multiple R-squared: 0.2618, Adjusted R-squared: 0.2587
## F-statistic: 85.1 on 1 and 240 DF, p-value: < 2.2e-16
Question 4 part 1
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.2.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
head(anscombe)
p1 <- ggplot(data = anscombe) +
geom_point(aes(x = x1, y = y1)) +
xlab(bquote(x[1])) +
ylab(bquote(y[1])) +
ggtitle(bquote(x[1]~"versus"~y[1])) +
theme(plot.title = element_text(hjust = 0.5))
p2 <- ggplot(data = anscombe) +
geom_point(aes(x = x2, y = y2)) +
xlab(bquote(x[2])) +
ylab(bquote(y[2])) +
ggtitle(bquote(x[2]~"versus"~y[2])) +
theme(plot.title = element_text(hjust = 0.5))
p3 <- ggplot(data = anscombe) +
geom_point(aes(x = x3, y = y3)) +
xlab(bquote(x[3])) +
ylab(bquote(y[3])) +
ggtitle(bquote(x[3]~"versus"~y[3])) +
theme(plot.title = element_text(hjust = 0.5))
p4 <- ggplot(data = anscombe) +
geom_point(aes(x = x4, y = y4)) +
xlab(bquote(x[4])) +
ylab(bquote(y[4])) +
ggtitle(bquote(x[4]~"versus"~y[4])) +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(p1, p2, p3, p4, nrow = 2)
Question 4 Part 2
model1 <- lm(data = anscombe, y1 ~ x1)
summary(model1)
##
## Call:
## lm(formula = y1 ~ x1, data = anscombe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.92127 -0.45577 -0.04136 0.70941 1.83882
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0001 1.1247 2.667 0.02573 *
## x1 0.5001 0.1179 4.241 0.00217 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared: 0.6665, Adjusted R-squared: 0.6295
## F-statistic: 17.99 on 1 and 9 DF, p-value: 0.00217
model2 <- lm(data = anscombe, y2 ~ x2)
summary(model2)
##
## Call:
## lm(formula = y2 ~ x2, data = anscombe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9009 -0.7609 0.1291 0.9491 1.2691
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.001 1.125 2.667 0.02576 *
## x2 0.500 0.118 4.239 0.00218 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared: 0.6662, Adjusted R-squared: 0.6292
## F-statistic: 17.97 on 1 and 9 DF, p-value: 0.002179
model3 <- lm(data = anscombe, y3 ~ x3)
summary(model3)
##
## Call:
## lm(formula = y3 ~ x3, data = anscombe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1586 -0.6146 -0.2303 0.1540 3.2411
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0025 1.1245 2.670 0.02562 *
## x3 0.4997 0.1179 4.239 0.00218 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared: 0.6663, Adjusted R-squared: 0.6292
## F-statistic: 17.97 on 1 and 9 DF, p-value: 0.002176
model4 <- lm(data = anscombe, y4 ~ x4)
summary(model4)
##
## Call:
## lm(formula = y4 ~ x4, data = anscombe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.751 -0.831 0.000 0.809 1.839
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0017 1.1239 2.671 0.02559 *
## x4 0.4999 0.1178 4.243 0.00216 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared: 0.6667, Adjusted R-squared: 0.6297
## F-statistic: 18 on 1 and 9 DF, p-value: 0.002165
Question 4 Part 3
attach(anscombe)
cor(x1,y1)
## [1] 0.8164205
cor(x2,y2)
## [1] 0.8162365
cor(x3,y3)
## [1] 0.8162867
cor(x4,y4)
## [1] 0.8165214
Question 4 Part 4
model1 <- lm(data = anscombe, x1 ~ y1)
summary(model1)
##
## Call:
## lm(formula = x1 ~ y1, data = anscombe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6522 -1.5117 -0.2657 1.2341 3.8946
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.9975 2.4344 -0.410 0.69156
## y1 1.3328 0.3142 4.241 0.00217 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.019 on 9 degrees of freedom
## Multiple R-squared: 0.6665, Adjusted R-squared: 0.6295
## F-statistic: 17.99 on 1 and 9 DF, p-value: 0.00217
model2 <- lm(data = anscombe, x2 ~ y2)
summary(model2)
##
## Call:
## lm(formula = x2 ~ y2, data = anscombe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8516 -1.4315 -0.3440 0.8467 4.2017
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.9948 2.4354 -0.408 0.69246
## y2 1.3325 0.3144 4.239 0.00218 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.02 on 9 degrees of freedom
## Multiple R-squared: 0.6662, Adjusted R-squared: 0.6292
## F-statistic: 17.97 on 1 and 9 DF, p-value: 0.002179
model3 <- lm(data = anscombe, x3 ~ y3)
summary(model3)
##
## Call:
## lm(formula = x3 ~ y3, data = anscombe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9869 -1.3733 -0.0266 1.3200 3.2133
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.0003 2.4362 -0.411 0.69097
## y3 1.3334 0.3145 4.239 0.00218 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.019 on 9 degrees of freedom
## Multiple R-squared: 0.6663, Adjusted R-squared: 0.6292
## F-statistic: 17.97 on 1 and 9 DF, p-value: 0.002176
model4 <- lm(data = anscombe, x4 ~ y4)
summary(model4)
##
## Call:
## lm(formula = x4 ~ y4, data = anscombe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7859 -1.4122 -0.1853 1.4551 3.3329
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.0036 2.4349 -0.412 0.68985
## y4 1.3337 0.3143 4.243 0.00216 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.018 on 9 degrees of freedom
## Multiple R-squared: 0.6667, Adjusted R-squared: 0.6297
## F-statistic: 18 on 1 and 9 DF, p-value: 0.002165
Question 5 Part 5
# x1 ~ y1
attach(anscombe)
## The following objects are masked from anscombe (pos = 3):
##
## x1, x2, x3, x4, y1, y2, y3, y4
cor1 <- cor(x1,y1)
SSE1 = sum((anscombe$y1 - residuals(model1))^2)
SST1 = SSE1*(1 - cor1^2)
SSR1 = (cor1^2)*SST1
print(paste(SSE1, SST1, SSR1))
## [1] "696.853029454035 232.370897285602 154.885069395006"
cor2 <- cor(x2,y2)
SSE2 = sum((anscombe$y2 - residuals(model2))^2)
SST2 = SSE2*(1 - cor2^2)
SSR2 = (cor2^2)*SST2
print(paste(SSE2, SST2, SSR2))
## [1] "696.889676289977 232.592481074854 154.962887621032"
cor3 <- cor(x3,y3)
SSE3 = sum((anscombe$y3 - residuals(model3))^2)
SST3 = SSE3*(1 - cor3^2)
SSR3 = (cor3^2)*SST3
print(paste(SSE3, SST3, SSR3))
## [1] "696.680555482679 232.465552420965 154.897386297907"
cor4 <- cor(x4,y4)
SSE4 = sum((anscombe$y4 - residuals(model4))^2)
SST4 = SSE4*(1 - cor4^2)
SSR4 = (cor4^2)*SST4
print(paste(SSE4, SST4, SSR4))
## [1] "696.794701741169 232.23661752193 154.833838219424"
Question 4 Part 6
library(ggplot2)
library(gridExtra)
p1 <- ggplot(anscombe, aes(x = x1, y = y1)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
xlab(bquote(x[1])) + ylab(bquote(y[1])) +
ggtitle(bquote(x[1]~"versus"~y[1])) +
theme(plot.title = element_text(hjust = 0.5))
p2 <- ggplot(anscombe, aes(x = x2, y = y2)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
xlab(bquote(x[2])) + ylab(bquote(y[2])) +
ggtitle(bquote(x[2]~"versus"~y[2])) +
theme(plot.title = element_text(hjust = 0.5))
p3 <- ggplot(anscombe, aes(x = x3, y = y3)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
xlab(bquote(x[3])) + ylab(bquote(y[3])) +
ggtitle(bquote(x[3]~"versus"~y[3])) +
theme(plot.title = element_text(hjust = 0.5))
p4 <- ggplot(anscombe, aes(x = x4, y = y4)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
xlab(bquote(x[4])) + ylab(bquote(y[4])) +
ggtitle(bquote(x[4]~"versus"~y[4])) +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(p1, p2, p3, p4, nrow = 2)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Question 5 part 1, 11
tomasetti = read.csv("http://people.math.binghamton.edu/qiao/data501/data/Tomasetti.csv")
head(tomasetti)
tomasetti$log_risk = log(tomasetti$Risk, base = 10)
tomasetti$log_lscd = log(tomasetti$Lscd, base = 10)
tomasetti_model <- lm(data = tomasetti, log_risk ~ log_lscd)
summary(tomasetti_model)
##
## Call:
## lm(formula = log_risk ~ log_lscd, data = tomasetti)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.65086 -0.46562 0.06228 0.43188 1.21052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.61049 0.72282 -10.53 2.02e-11 ***
## log_lscd 0.53260 0.07316 7.28 5.12e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7491 on 29 degrees of freedom
## Multiple R-squared: 0.6463, Adjusted R-squared: 0.6341
## F-statistic: 53 on 1 and 29 DF, p-value: 5.117e-08
Question 5 part 2 and 3
predict_band <- as.data.frame(predict(tomasetti_model, interval = "prediction", level = 0.95))
## Warning in predict.lm(tomasetti_model, interval = "prediction", level = 0.95): predictions on current data refer to _future_ responses
tomasetti_pred <- cbind(tomasetti, predict_band)
ggplot(tomasetti_pred, aes(x = log_lscd, y = log_risk)) + geom_point() +
geom_line(aes(y = fit), color = "blue") + # line of best fit
geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "grey", alpha = 0.5) +
xlab("Log of LSCD") +
ylab("Log of RISK") +
ggtitle("Log of LSCD vs. Log of Risk")
Question 5 part 4
predict(tomasetti_model, newdata = data.frame(log_lscd = log(10^10)), interval = "prediction", level = 0.95)
## fit lwr upr
## 1 4.653053 2.12439 7.181717
Question 5 part 5
conf_band <- as.data.frame(predict(tomasetti_model, interval = "confidence", level = 0.95))
tomasetti_conf <- cbind(tomasetti, conf_band)
ggplot(tomasetti_conf, aes(x = log_lscd, y = log_risk)) + geom_point() +
geom_line(aes(y = fit), color = "blue") + # line of best fit
geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "grey", alpha = 0.5) +
xlab("Log of LSCD") +
ylab("Log of RISK") +
ggtitle("Log of LSCD vs. Log of Risk")
Question 5 part 6
predict(tomasetti_model, newdata = data.frame(log_lscd = log(10^10)), interval = "confidence", level = 0.95)
## fit lwr upr
## 1 4.653053 2.64134 6.664767
Question 5 part 9
confint(tomasetti_model, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -9.0888260 -6.1321478
## log_lscd 0.3829708 0.6822268
Question 5 part 13
tomasetti_model <- lm(data = tomasetti, log_risk ~ log_lscd)
summary(tomasetti_model)
##
## Call:
## lm(formula = log_risk ~ log_lscd, data = tomasetti)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.65086 -0.46562 0.06228 0.43188 1.21052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.61049 0.72282 -10.53 2.02e-11 ***
## log_lscd 0.53260 0.07316 7.28 5.12e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7491 on 29 degrees of freedom
## Multiple R-squared: 0.6463, Adjusted R-squared: 0.6341
## F-statistic: 53 on 1 and 29 DF, p-value: 5.117e-08
residual_std_error = 0.7491
var_est = residual_std_error^2
var_est
## [1] 0.5611508
Question 6 part 1, 2 & 4
survey <- read.table("http://www1.aucegypt.edu/faculty/hadi/RABE5/Data5/P054.txt", header = TRUE, sep = "\t")
head(survey)
ggplot(data = survey, aes(x = Daily, y = Sunday)) + geom_point() +
geom_smooth(method = "lm", se = FALSE) +
ggtitle("Daily Circulations versus Sunday Circulations")
## `geom_smooth()` using formula = 'y ~ x'
Question 6 part 5
circulation_model <- lm(data = survey, Sunday ~ Daily)
anova(circulation_model)
qf(.95, 1, 32)
## [1] 4.149097
Question 6 part 7
# R^2 = F/(n-2+F)
F_value = 358.52
n = 34
R = F_value/(n-2+F_value)
R
## [1] 0.918058