rm(list=ls())
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
data <- read.csv("hw4_task1.csv")
#Mean and SD for x and y by group
stats <- data %>% #I use pipe operator and groupby fn of dplyr to
#deal with separating groups and doing the analysis,I used the same in
#the 'choose any package and tweak it' HW problem as well.
group_by(group) %>%
summarise(
x_mean = mean(x),
x_sd = sd(x),
y_mean = mean(y),
y_sd = sd(y)
)
print(stats)
## # A tibble: 12 × 5
## group x_mean x_sd y_mean y_sd
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 A 54.3 16.8 47.8 26.9
## 2 B 54.3 16.8 47.8 26.9
## 3 C 54.3 16.8 47.8 26.9
## 4 D 54.3 16.8 47.8 26.9
## 5 E 54.3 16.8 47.8 26.9
## 6 F 54.3 16.8 47.8 26.9
## 7 G 54.3 16.8 47.8 26.9
## 8 H 54.3 16.8 47.8 26.9
## 9 I 54.3 16.8 47.8 26.9
## 10 J 54.3 16.8 47.8 26.9
## 11 K 54.3 16.8 47.8 26.9
## 12 L 54.3 16.8 47.8 26.9
# Correlation between x and y for each group
correlations <- data %>%
group_by(group) %>%
summarise(correlation = cor(x, y))
print(correlations)
## # A tibble: 12 × 2
## group correlation
## <chr> <dbl>
## 1 A -0.0645
## 2 B -0.0641
## 3 C -0.0617
## 4 D -0.0694
## 5 E -0.0656
## 6 F -0.0630
## 7 G -0.0685
## 8 H -0.0603
## 9 I -0.0666
## 10 J -0.0686
## 11 K -0.0686
## 12 L -0.0690
#Scatterplots in rows of three
ggplot(data, aes(x = x, y = y)) +
geom_point() +
facet_wrap(~ group, ncol = 4) +
#I used facet_wrap of ggplot2 which works in the same way as par(mfrow=c(,))in base R
labs(title = "Scatterplots of y vs x by Group")
Observations: All groups show very weak Neg correlations (around -0.06), Correlation values are remarkably consistent across all 12 groups, All correlations are close to zero, indicating no meaningful linear relationship, The strongest correlation is Group H (-0.0603), weakest is Group C (-0.0617), Range is very narrow: only 0.0094 difference between min and max correlation, A looks like a dinosaur, B looks like a firework, C looks like 5 distinct horizontal straight lines, D looks like 4 distinct vertical straight lines,E looks like X, F looks like a star, G looks like two separate horizontal clusters, H looks like 9 points each point represents one cluster, I looks like two vertical clusters, J looks like a ring, K looks like tilted straight lines (from south west to north east), L looks like tilted straight lines (from south east to north west)
rm(list=ls())
par(mfrow = c(1, 1))
data(rock)
head(rock)
## area peri shape perm
## 1 4990 2791.90 0.0903296 6.3
## 2 7002 3892.60 0.1486220 6.3
## 3 7558 3930.66 0.1833120 6.3
## 4 7352 3869.32 0.1170630 6.3
## 5 7943 3948.54 0.1224170 17.1
## 6 7979 4010.15 0.1670450 17.1
#linear regression
model <- lm(peri ~ area, data = rock)
summary(model)
##
## Call:
## lm(formula = peri ~ area, data = rock)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2306.8 -502.3 122.5 564.5 1291.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -471.43579 342.77487 -1.375 0.176
## area 0.43875 0.04473 9.808 7.51e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 823.1 on 46 degrees of freedom
## Multiple R-squared: 0.6765, Adjusted R-squared: 0.6695
## F-statistic: 96.2 on 1 and 46 DF, p-value: 7.506e-13
#studentized residuals
rock$student_resid <- rstudent(model)
#I used quantiles for balanced groups(3 groups)
rock$resid_group <- cut(abs(rock$student_resid),
breaks = quantile(abs(rock$student_resid),
probs = c(0, 0.33, 0.67, 1)),
labels = c("Low", "Medium", "High"))
#scatterplot
plot(rock$area, rock$peri,
col = c("green", "blue", "red")[rock$resid_group],
pch = c(16, 17, 18)[rock$resid_group],
xlab = "Area", ylab = "Perimeter",
main = "Rock Samples: Perimeter vs Area")
#regression line
abline(model, col = "darkgreen", lwd = 2)
legend("topleft",
legend = c(levels(rock$resid_group), "Regression Line"),
col = c("green", "blue", "red", "darkgreen"),
pch = c(16, 17, 18, NA),
lty = c(NA, NA, NA, 1),
cex=0.6)
set.seed(123)
#1000 samples from each distribution
normal_samples <- rnorm(1000, mean = -2, sd = 1)
gamma1_samples <- rgamma(1000, shape = 2, rate = 0.5)
gamma2_samples <- rgamma(1000, shape = 1, rate = 1)
#Side-by-side boxplots
boxplot(normal_samples, gamma1_samples, gamma2_samples,
names = c("N(-2,1)", "Gamma(2,0.5)", "Gamma(1,1)"),
main = "Side-by-Side Boxplots",
ylab = "Values")
#Overlayed histograms
hist(normal_samples, col = rgb(1, 0, 0, 0.5),
xlim = c(-5, 15), ylim = c(0, 800),
main = "Overlayed Histograms",
xlab = "Values", ylab = "Frequency")
hist(gamma1_samples, col = rgb(0, 1, 0, 0.5), add = TRUE)
hist(gamma2_samples, col = rgb(0, 0, 1, 0.5), add = TRUE)
legend("topright",
legend = c("N(-2,1)", "Gamma(2,0.5)", "Gamma(1,1)"),
fill = c(rgb(1,0,0,0.5), rgb(0,1,0,0.5), rgb(0,0,1,0.5)))
library(mvtnorm)
# First distribution: y follows MVN(mu=[0,0], sigma=[[9,0],[0,9]])
x <- seq(-10, 10, length = 100)
y <- seq(-10, 10, length = 100)
grid <- expand.grid(x = x, y = y)
#PDF values for first distribution
mu1 <- c(0, 0)
sigma1 <- matrix(c(9, 0, 0, 9), nrow = 2)
z1 <- matrix(dmvnorm(grid, mean = mu1, sigma = sigma1), nrow = length(x))
#3D surface plot for first distribution
persp(x, y, z1,
xlab = "y1", ylab = "y2", zlab = "Density",
main = "Bivariate Normal: mu=[0,0], sigma=[[9,0],[0,9]]",
theta = 30, phi = 30, col = "lightblue")
#Heatmap with contours for first distribution
filled.contour(x, y, z1,
xlab = "y1", ylab = "y2",
main = "Heatmap: mu=[0,0], sigma=[[9,0],[0,9]]")
# Second distribution: w ~ MVN(mu=[1,2], sigma=[[9,-8],[-8,9]])
#PDF values for second distribution
mu2 <- c(1, 2)
sigma2 <- matrix(c(9, -8, -8, 9), nrow = 2)
z2 <- matrix(dmvnorm(grid, mean = mu2, sigma = sigma2), nrow = length(x))
# 3D surface plot for second distribution
persp(x, y, z2,
xlab = "w1", ylab = "w2", zlab = "Density",
main = "Bivariate Normal: mu=[1,2], sigma=[[9,-8],[-8,9]]",
theta = 30, phi = 30, col = "lightgreen")
#Heatmap with contours for second distribution
filled.contour(x, y, z2,
xlab = "w1", ylab = "w2",
main = "Heatmap: mu=[1,2], sigma=[[9,-8],[-8,9]]")