install.packages("rmarkdown")

# sets the working directory to the folder on my computer where the data file (HW21.csv) was downloaded
setwd("C:\\Users\\hmon1\\Desktop\\10C Homework\\")
# loads in data for the full population
pop <- read.csv("HW21.csv")
names(pop) <- c("G", "X", "Y")
# sets the seed for the random number generator
set.seed(48183130)  # seed set to my student ID so the "random" sample is reproducible
# assigns a "random" sample of 10 from group A and 5 from group B from the population
G_A<-pop[sample(nrow(pop), 10, pop$G == "A", replace=FALSE),]
G_B<-pop[sample(nrow(pop), 5, pop$G == "B", replace=FALSE),]
# combines the two group samples into the data set used for the rest of the analysis
data <- rbind(G_A, G_B)
data
##     G  X  Y
## 421 A 10 10
## 659 A 65 65
## 219 A 70 80
## 800 A 40 30
## 187 A 35 35
## 542 A 65 65
## 274 A 60 70
## 773 A 20 30
## 94  A 70 60
## 841 A 20 30
## 544 B 25 40
## 576 B 25 60
## 178 B 25 60
## 472 B 35 45
## 312 B 25 40
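# Side note: a more transparent (but not identical) way to draw these samples is
# to subset each group first and then sample rows within the subset, as sketched
# below. With the same seed this draws different rows than the prob-weighted
# sample() calls above, so it is shown only as an illustration.
pop_A <- pop[pop$G == "A", ]
pop_B <- pop[pop$G == "B", ]
G_A_alt <- pop_A[sample(nrow(pop_A), 10, replace = FALSE), ]
G_B_alt <- pop_B[sample(nrow(pop_B), 5, replace = FALSE), ]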
plot(data$X, data$Y, main=c(paste("Scatterplot")), xlab="X", ylab="Y", pch = c(1,3)[as.factor(data$G)])
legend("bottomright", pch=c(1,3), c("A","B"), bty="o", cex=.8)

# regression for A and B combined
model_full <- lm(Y ~ X, data=data)
summary(model_full)
## 
## Call:
## lm(formula = Y ~ X, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.485  -6.897  -1.660   4.698  22.421 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  19.4037     7.0565   2.750 0.016545 *  
## X             0.7270     0.1597   4.554 0.000542 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.46 on 13 degrees of freedom
## Multiple R-squared:  0.6146, Adjusted R-squared:  0.585 
## F-statistic: 20.73 on 1 and 13 DF,  p-value: 0.0005416
# Pearson's r for A and B combined
r_full <- cor(data$X, data$Y)
r_full
## [1] 0.7839902
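# sanity check: in simple linear regression, Multiple R-squared is the square of
# Pearson's r, so squaring r_full should reproduce the 0.6146 reported above
r_full^2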
# regression for A
model_A <- lm(Y ~ X, data=G_A)
summary(model_A)
## 
## Call:
## lm(formula = Y ~ X, data = G_A)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.3499  -3.8610  -0.7594   6.3776   9.5587 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.8948     5.9538   0.822    0.435    
## X             0.9364     0.1177   7.954 4.55e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.218 on 8 degrees of freedom
## Multiple R-squared:  0.8877, Adjusted R-squared:  0.8737 
## F-statistic: 63.26 on 1 and 8 DF,  p-value: 4.553e-05
# Pearson's r for A
r_A <- cor(G_A$X, G_A$Y)
r_A
## [1] 0.9421967
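# optional sketch (not required by the assignment): a 95% confidence interval
# for the group-A slope, using confint() on the fitted model
confint(model_A, "X", level = 0.95)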
# regression for B
model_B <- lm(Y ~ X, data=G_B)
summary(model_B)
## 
## Call:
## lm(formula = Y ~ X, data = G_B)
## 
## Residuals:
##        544        576        178        472        312 
## -1.000e+01  1.000e+01  1.000e+01 -4.441e-15 -1.000e+01 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   62.500     35.237   1.774    0.174
## X             -0.500      1.291  -0.387    0.724
## 
## Residual standard error: 11.55 on 3 degrees of freedom
## Multiple R-squared:  0.04762,    Adjusted R-squared:  -0.2698 
## F-statistic:  0.15 on 1 and 3 DF,  p-value: 0.7244
# Pearson's r for B
r_B <- cor(G_B$X, G_B$Y)
r_B
## [1] -0.2182179
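# optional check (my own addition, not part of the assignment): with only n = 5
# points in group B, cor.test() confirms that this weak negative correlation is
# not statistically distinguishable from zero (same p-value as the slope test above)
cor.test(G_B$X, G_B$Y)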
# recreates the scatterplot and overlays the three fitted regression lines
plot(data$X, data$Y, main = "Scatterplot by Subgroups", xlab = "X", ylab = "Y",
     pch = c(1, 3)[as.factor(data$G)])
abline(model_full)
abline(model_A, lty = "dashed")
abline(model_B, lty = "dotted")
legend("bottomright", pch=c(1,3), c("A","B"), bty="o", cex=.8)