DATA 501 HW 1

Question 2

# 1
library(rvest)

## Warning: package 'rvest' was built under R version 4.2.3

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.2.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.2.3

url = "http://cdec.water.ca.gov/cgi-progs/precip1/8STATIONHIST"
webpage = readLines(url)[476:593]
ind_of_rows_with_space = which(webpage == webpage[2])
webpage = webpage[-ind_of_rows_with_space]
# remove white space in each line
webpage = lapply(as.list(webpage), function(x){trimws(x)}) %>% unlist()
library(stringr)
temp = lapply(as.list(webpage), function(x){strsplit(x, " ", fixed = TRUE)}) %>% unlist()
temp = temp[-which(temp == temp[2])]
df = matrix(temp, ncol = 14, byrow = TRUE)

## Warning in matrix(temp, ncol = 14, byrow = TRUE): data length [310] is not a
## sub-multiple or multiple of the number of rows [23]

## Warning in matrix(temp, ncol = 14, byrow = TRUE): data length [310] is not a
## sub-multiple or multiple of the number of rows [23]
col.names.df = df[1, ]
df = df[-1, ] %>% data.frame()
colnames(df) = col.names.df
df = apply(df, 2, function(x){x = as.numeric(x)}) %>% data.frame()

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): NAs introduced by coercion

write.csv(df, file = "rainfall.csv", row.names = FALSE)


url_groundhog = 'http://people.math.binghamton.edu/qiao/data501/data/groundhog.table'
url_rainfall = 'http://people.math.binghamton.edu/qiao/data501/data/rainfall.csv'
groundhog <- read.csv(url(url_groundhog))
rainfall <- read.csv(url(url_rainfall))
names(rainfall)[1] <- "year" # change the name of the first column so that left_join below
# knows how to merge the two tables.
merged <- left_join(groundhog,rainfall)

## Joining with `by = join_by(year)`

merged$mean_rain = merged$Total/12
ggplot(data = merged, aes(y=mean_rain, x =shadow, color=shadow)) + geom_boxplot()

Question 2 part 2

shadow <- merged[merged$shadow == "Y",]
no_shadow <- merged[merged$shadow == "N",]
t.test(shadow$mean_rain, no_shadow$mean_rain, var.equal = TRUE)

## 
##  Two Sample t-test
## 
## data:  shadow$mean_rain and no_shadow$mean_rain
## t = -0.54731, df = 19, p-value = 0.5905
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.946150  1.139317
## sample estimates:
## mean of x mean of y 
##  4.297917  4.701333

Question 2 part 4

feb_shadow <- shadow %>% select(Feb, mean_rain)
feb_no_shadow <- no_shadow %>% select(Feb, mean_rain)

t.test(feb_shadow$mean_rain, feb_no_shadow$mean_rain, var.equal = TRUE)

## 
##  Two Sample t-test
## 
## data:  feb_shadow$mean_rain and feb_no_shadow$mean_rain
## t = -0.54731, df = 19, p-value = 0.5905
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.946150  1.139317
## sample estimates:
## mean of x mean of y 
##  4.297917  4.701333

# Fail to reject null at .05 alpha level due to p-value being 0.5905 and 0 being in the confidence interval

Question 3 Part 1

library(alr4)

## Warning: package 'alr4' was built under R version 4.2.3

## Loading required package: car

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.2.3

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## Loading required package: effects

## Warning: package 'effects' was built under R version 4.2.3

## Warning in check_dep_version(): ABI version mismatch: 
## lme4 was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install lme4 from source or restore original 'Matrix' package

## lattice theme set by effectsTheme()
## See ?effectsTheme for details.

library(magrittr)

## Warning: package 'magrittr' was built under R version 4.2.3

head(walleye)

walleye_sub = walleye %>% filter(age >= 5) %>% filter(age <= 8)
walleye_sub$age = factor(walleye_sub$age)

ggplot(walleye_sub, aes(y = length, x = age, color = age)) + geom_boxplot(fill = "orange")

Question 3 part 2

walleye_sub %>% group_by(age) %>% summarise(average = mean(length), standard_dev = sd(length))

Question 3 part 3

ggplot(walleye_sub, aes(x = length)) + geom_histogram() + facet_wrap(~ age)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Question 3 Part 4

age5 = walleye %>% filter(age == 5) %>% .$length
age7 = walleye %>% filter(age == 7) %>% .$length
t.test(age5, age7)$conf.int

## [1] -44.37574 -33.28132
## attr(,"conf.level")
## [1] 0.95

Question 3 Part 5

t.test(age5, age7, conf.level = .9)

## 
##  Welch Two Sample t-test
## 
## data:  age5 and age7
## t = -13.772, df = 316.61, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 90 percent confidence interval:
##  -43.47971 -34.17735
## sample estimates:
## mean of x mean of y 
##  364.0497  402.8783

Question 3 Part 6

age57 = walleye %>% filter(age == c(5,7)) %>% .[,c("age","length")]
age57$age = factor(age57$age)
summary(lm(data = age57, length ~ age))

## 
## Call:
## lm(formula = length ~ age, data = age57)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.776 -18.275  -2.989  17.538  78.658 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  364.762      2.203 165.597   <2e-16 ***
## age7          36.259      3.931   9.225   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.38 on 240 degrees of freedom
## Multiple R-squared:  0.2618, Adjusted R-squared:  0.2587 
## F-statistic:  85.1 on 1 and 240 DF,  p-value: < 2.2e-16

Question 4 part 1

library(gridExtra)

## Warning: package 'gridExtra' was built under R version 4.2.3

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

head(anscombe)

p1 <- ggplot(data = anscombe) +
geom_point(aes(x = x1, y = y1)) +
xlab(bquote(x[1])) +
ylab(bquote(y[1])) +
ggtitle(bquote(x[1]~"versus"~y[1])) +
theme(plot.title = element_text(hjust = 0.5))

p2 <- ggplot(data = anscombe) +
geom_point(aes(x = x2, y = y2)) +
xlab(bquote(x[2])) +
ylab(bquote(y[2])) +
ggtitle(bquote(x[2]~"versus"~y[2])) +
theme(plot.title = element_text(hjust = 0.5))

p3 <- ggplot(data = anscombe) +
geom_point(aes(x = x3, y = y3)) +
xlab(bquote(x[3])) +
ylab(bquote(y[3])) +
ggtitle(bquote(x[3]~"versus"~y[3])) +
theme(plot.title = element_text(hjust = 0.5))

p4 <- ggplot(data = anscombe) +
geom_point(aes(x = x4, y = y4)) +
xlab(bquote(x[4])) +
ylab(bquote(y[4])) +
ggtitle(bquote(x[4]~"versus"~y[4])) +
theme(plot.title = element_text(hjust = 0.5))


grid.arrange(p1, p2, p3, p4, nrow = 2)

Question 4 Part 2

model1 <- lm(data = anscombe, y1 ~ x1)
summary(model1)

## 
## Call:
## lm(formula = y1 ~ x1, data = anscombe)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.92127 -0.45577 -0.04136  0.70941  1.83882 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   3.0001     1.1247   2.667  0.02573 * 
## x1            0.5001     0.1179   4.241  0.00217 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared:  0.6665, Adjusted R-squared:  0.6295 
## F-statistic: 17.99 on 1 and 9 DF,  p-value: 0.00217

model2 <- lm(data = anscombe, y2 ~ x2)
summary(model2)

## 
## Call:
## lm(formula = y2 ~ x2, data = anscombe)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9009 -0.7609  0.1291  0.9491  1.2691 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    3.001      1.125   2.667  0.02576 * 
## x2             0.500      0.118   4.239  0.00218 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared:  0.6662, Adjusted R-squared:  0.6292 
## F-statistic: 17.97 on 1 and 9 DF,  p-value: 0.002179

model3 <- lm(data = anscombe, y3 ~ x3)
summary(model3)

## 
## Call:
## lm(formula = y3 ~ x3, data = anscombe)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1586 -0.6146 -0.2303  0.1540  3.2411 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   3.0025     1.1245   2.670  0.02562 * 
## x3            0.4997     0.1179   4.239  0.00218 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared:  0.6663, Adjusted R-squared:  0.6292 
## F-statistic: 17.97 on 1 and 9 DF,  p-value: 0.002176

model4 <- lm(data = anscombe, y4 ~ x4)
summary(model4)

## 
## Call:
## lm(formula = y4 ~ x4, data = anscombe)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.751 -0.831  0.000  0.809  1.839 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   3.0017     1.1239   2.671  0.02559 * 
## x4            0.4999     0.1178   4.243  0.00216 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared:  0.6667, Adjusted R-squared:  0.6297 
## F-statistic:    18 on 1 and 9 DF,  p-value: 0.002165

Question 4 Part 3

attach(anscombe)
cor(x1,y1)

## [1] 0.8164205

cor(x2,y2)

## [1] 0.8162365

cor(x3,y3)

## [1] 0.8162867

cor(x4,y4)

## [1] 0.8165214

Question 4 Part 4

model1 <- lm(data = anscombe, x1 ~ y1)
summary(model1)

## 
## Call:
## lm(formula = x1 ~ y1, data = anscombe)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6522 -1.5117 -0.2657  1.2341  3.8946 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  -0.9975     2.4344  -0.410  0.69156   
## y1            1.3328     0.3142   4.241  0.00217 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.019 on 9 degrees of freedom
## Multiple R-squared:  0.6665, Adjusted R-squared:  0.6295 
## F-statistic: 17.99 on 1 and 9 DF,  p-value: 0.00217

model2 <- lm(data = anscombe, x2 ~ y2)
summary(model2)

## 
## Call:
## lm(formula = x2 ~ y2, data = anscombe)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8516 -1.4315 -0.3440  0.8467  4.2017 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  -0.9948     2.4354  -0.408  0.69246   
## y2            1.3325     0.3144   4.239  0.00218 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.02 on 9 degrees of freedom
## Multiple R-squared:  0.6662, Adjusted R-squared:  0.6292 
## F-statistic: 17.97 on 1 and 9 DF,  p-value: 0.002179

model3 <- lm(data = anscombe, x3 ~ y3)
summary(model3)

## 
## Call:
## lm(formula = x3 ~ y3, data = anscombe)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9869 -1.3733 -0.0266  1.3200  3.2133 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  -1.0003     2.4362  -0.411  0.69097   
## y3            1.3334     0.3145   4.239  0.00218 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.019 on 9 degrees of freedom
## Multiple R-squared:  0.6663, Adjusted R-squared:  0.6292 
## F-statistic: 17.97 on 1 and 9 DF,  p-value: 0.002176

model4 <- lm(data = anscombe, x4 ~ y4)
summary(model4)

## 
## Call:
## lm(formula = x4 ~ y4, data = anscombe)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7859 -1.4122 -0.1853  1.4551  3.3329 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  -1.0036     2.4349  -0.412  0.68985   
## y4            1.3337     0.3143   4.243  0.00216 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.018 on 9 degrees of freedom
## Multiple R-squared:  0.6667, Adjusted R-squared:  0.6297 
## F-statistic:    18 on 1 and 9 DF,  p-value: 0.002165

Question 5 Part 5

# x1 ~ y1
attach(anscombe)

## The following objects are masked from anscombe (pos = 3):
## 
##     x1, x2, x3, x4, y1, y2, y3, y4

cor1 <- cor(x1,y1)
SSE1 = sum((anscombe$y1 - residuals(model1))^2)
SST1 = SSE1*(1 - cor1^2)
SSR1 = (cor1^2)*SST1
print(paste(SSE1, SST1, SSR1))

## [1] "696.853029454035 232.370897285602 154.885069395006"

cor2 <- cor(x2,y2)
SSE2 = sum((anscombe$y2 - residuals(model2))^2)
SST2 = SSE2*(1 - cor2^2)
SSR2 = (cor2^2)*SST2
print(paste(SSE2, SST2, SSR2))

## [1] "696.889676289977 232.592481074854 154.962887621032"

cor3 <- cor(x3,y3)
SSE3 = sum((anscombe$y3 - residuals(model3))^2)
SST3 = SSE3*(1 - cor3^2)
SSR3 = (cor3^2)*SST3
print(paste(SSE3, SST3, SSR3))

## [1] "696.680555482679 232.465552420965 154.897386297907"

cor4 <- cor(x4,y4)
SSE4 = sum((anscombe$y4 - residuals(model4))^2)
SST4 = SSE4*(1 - cor4^2)
SSR4 = (cor4^2)*SST4
print(paste(SSE4, SST4, SSR4))

## [1] "696.794701741169 232.23661752193 154.833838219424"

Question 4 Part 6

library(ggplot2)
library(gridExtra)

p1 <- ggplot(anscombe, aes(x = x1, y = y1)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  xlab(bquote(x[1])) + ylab(bquote(y[1])) +
  ggtitle(bquote(x[1]~"versus"~y[1])) +
  theme(plot.title = element_text(hjust = 0.5))

p2 <- ggplot(anscombe, aes(x = x2, y = y2)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  xlab(bquote(x[2])) + ylab(bquote(y[2])) +
  ggtitle(bquote(x[2]~"versus"~y[2])) +
  theme(plot.title = element_text(hjust = 0.5))

p3 <- ggplot(anscombe, aes(x = x3, y = y3)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  xlab(bquote(x[3])) + ylab(bquote(y[3])) +
  ggtitle(bquote(x[3]~"versus"~y[3])) +
  theme(plot.title = element_text(hjust = 0.5))

p4 <- ggplot(anscombe, aes(x = x4, y = y4)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  xlab(bquote(x[4])) + ylab(bquote(y[4])) +
  ggtitle(bquote(x[4]~"versus"~y[4])) +
  theme(plot.title = element_text(hjust = 0.5))

grid.arrange(p1, p2, p3, p4, nrow = 2)

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Question 5 part 1, 11

tomasetti = read.csv("http://people.math.binghamton.edu/qiao/data501/data/Tomasetti.csv")
head(tomasetti)

tomasetti$log_risk = log(tomasetti$Risk, base = 10)
tomasetti$log_lscd = log(tomasetti$Lscd, base = 10)
tomasetti_model <- lm(data = tomasetti, log_risk ~ log_lscd)
summary(tomasetti_model)

## 
## Call:
## lm(formula = log_risk ~ log_lscd, data = tomasetti)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.65086 -0.46562  0.06228  0.43188  1.21052 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.61049    0.72282  -10.53 2.02e-11 ***
## log_lscd     0.53260    0.07316    7.28 5.12e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7491 on 29 degrees of freedom
## Multiple R-squared:  0.6463, Adjusted R-squared:  0.6341 
## F-statistic:    53 on 1 and 29 DF,  p-value: 5.117e-08

Question 5 part 2 and 3

predict_band <- as.data.frame(predict(tomasetti_model, interval = "prediction", level = 0.95))

## Warning in predict.lm(tomasetti_model, interval = "prediction", level = 0.95): predictions on current data refer to _future_ responses

tomasetti_pred <- cbind(tomasetti, predict_band)
ggplot(tomasetti_pred, aes(x = log_lscd, y = log_risk)) + geom_point() +
  geom_line(aes(y = fit), color = "blue") +   # line of best fit
  geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "grey", alpha = 0.5) +
  xlab("Log of LSCD") +
  ylab("Log of RISK") +
  ggtitle("Log of LSCD vs. Log of Risk")

Question 5 part 4

predict(tomasetti_model, newdata = data.frame(log_lscd = log(10^10)), interval = "prediction", level = 0.95)

##        fit     lwr      upr
## 1 4.653053 2.12439 7.181717

Question 5 part 5

conf_band <- as.data.frame(predict(tomasetti_model, interval = "confidence", level = 0.95))
tomasetti_conf <- cbind(tomasetti, conf_band)
ggplot(tomasetti_conf, aes(x = log_lscd, y = log_risk)) + geom_point() +
  geom_line(aes(y = fit), color = "blue") +   # line of best fit
  geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "grey", alpha = 0.5) +
  xlab("Log of LSCD") +
  ylab("Log of RISK") +
  ggtitle("Log of LSCD vs. Log of Risk")

Question 5 part 6

predict(tomasetti_model, newdata = data.frame(log_lscd = log(10^10)), interval = "confidence", level = 0.95)

##        fit     lwr      upr
## 1 4.653053 2.64134 6.664767

Question 5 part 9

confint(tomasetti_model, level = 0.95)

##                  2.5 %     97.5 %
## (Intercept) -9.0888260 -6.1321478
## log_lscd     0.3829708  0.6822268

Question 5 part 13

tomasetti_model <- lm(data = tomasetti, log_risk ~ log_lscd)
summary(tomasetti_model)

## 
## Call:
## lm(formula = log_risk ~ log_lscd, data = tomasetti)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.65086 -0.46562  0.06228  0.43188  1.21052 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.61049    0.72282  -10.53 2.02e-11 ***
## log_lscd     0.53260    0.07316    7.28 5.12e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7491 on 29 degrees of freedom
## Multiple R-squared:  0.6463, Adjusted R-squared:  0.6341 
## F-statistic:    53 on 1 and 29 DF,  p-value: 5.117e-08

residual_std_error = 0.7491
var_est = residual_std_error^2
var_est

## [1] 0.5611508

Question 6 part 1, 2 & 4

survey <- read.table("http://www1.aucegypt.edu/faculty/hadi/RABE5/Data5/P054.txt", header = TRUE, sep = "\t")
head(survey)

ggplot(data = survey, aes(x = Daily, y = Sunday)) + geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("Daily Circulations versus Sunday Circulations")

## `geom_smooth()` using formula = 'y ~ x'

Question 6 part 5

circulation_model <- lm(data = survey, Sunday ~ Daily)
anova(circulation_model)

qf(.95, 1, 32)

## [1] 4.149097

Question 6 part 7

# R^2 = F/(n-2+F)
F_value = 358.52
n = 34
R = F_value/(n-2+F_value)
R

## [1] 0.918058