problem1

rm(list=ls())

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)


data <- read.csv("hw4_task1.csv")

#Mean and SD for x and y by group
stats <- data %>%  #I use pipe operator and groupby fn of dplyr to 
  #deal with separating groups and doing the analysis,I used the same in
  #the 'choose any package and tweak it' HW problem as well.
  group_by(group) %>%
  summarise(
    x_mean = mean(x),
    x_sd = sd(x),
    y_mean = mean(y), 
    y_sd = sd(y)
  )
print(stats)

## # A tibble: 12 × 5
##    group x_mean  x_sd y_mean  y_sd
##    <chr>  <dbl> <dbl>  <dbl> <dbl>
##  1 A       54.3  16.8   47.8  26.9
##  2 B       54.3  16.8   47.8  26.9
##  3 C       54.3  16.8   47.8  26.9
##  4 D       54.3  16.8   47.8  26.9
##  5 E       54.3  16.8   47.8  26.9
##  6 F       54.3  16.8   47.8  26.9
##  7 G       54.3  16.8   47.8  26.9
##  8 H       54.3  16.8   47.8  26.9
##  9 I       54.3  16.8   47.8  26.9
## 10 J       54.3  16.8   47.8  26.9
## 11 K       54.3  16.8   47.8  26.9
## 12 L       54.3  16.8   47.8  26.9

# Correlation between x and y for each group
correlations <- data %>%
  group_by(group) %>%
  summarise(correlation = cor(x, y))
print(correlations)

## # A tibble: 12 × 2
##    group correlation
##    <chr>       <dbl>
##  1 A         -0.0645
##  2 B         -0.0641
##  3 C         -0.0617
##  4 D         -0.0694
##  5 E         -0.0656
##  6 F         -0.0630
##  7 G         -0.0685
##  8 H         -0.0603
##  9 I         -0.0666
## 10 J         -0.0686
## 11 K         -0.0686
## 12 L         -0.0690

#Scatterplots in rows of three
ggplot(data, aes(x = x, y = y)) +
  geom_point() +
  facet_wrap(~ group, ncol = 4) + 
  #I used facet_wrap of ggplot2 which works in the same way as par(mfrow=c(,))in base R
  labs(title = "Scatterplots of y vs x by Group")

Observations: All groups show very weak Neg correlations (around -0.06), Correlation values are remarkably consistent across all 12 groups, All correlations are close to zero, indicating no meaningful linear relationship, The strongest correlation is Group H (-0.0603), weakest is Group C (-0.0617), Range is very narrow: only 0.0094 difference between min and max correlation, A looks like a dinosaur, B looks like a firework, C looks like 5 distinct horizontal straight lines, D looks like 4 distinct vertical straight lines,E looks like X, F looks like a star, G looks like two separate horizontal clusters, H looks like 9 points each point represents one cluster, I looks like two vertical clusters, J looks like a ring, K looks like tilted straight lines (from south west to north east), L looks like tilted straight lines (from south east to north west)

problem2

rm(list=ls())
par(mfrow = c(1, 1))

data(rock)
head(rock)

##   area    peri     shape perm
## 1 4990 2791.90 0.0903296  6.3
## 2 7002 3892.60 0.1486220  6.3
## 3 7558 3930.66 0.1833120  6.3
## 4 7352 3869.32 0.1170630  6.3
## 5 7943 3948.54 0.1224170 17.1
## 6 7979 4010.15 0.1670450 17.1

#linear regression
model <- lm(peri ~ area, data = rock)
summary(model)

## 
## Call:
## lm(formula = peri ~ area, data = rock)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2306.8  -502.3   122.5   564.5  1291.9 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -471.43579  342.77487  -1.375    0.176    
## area           0.43875    0.04473   9.808 7.51e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 823.1 on 46 degrees of freedom
## Multiple R-squared:  0.6765, Adjusted R-squared:  0.6695 
## F-statistic:  96.2 on 1 and 46 DF,  p-value: 7.506e-13

#studentized residuals
rock$student_resid <- rstudent(model)

#I used quantiles for balanced groups(3 groups)
rock$resid_group <- cut(abs(rock$student_resid),
                        breaks = quantile(abs(rock$student_resid), 
                                          probs = c(0, 0.33, 0.67, 1)),
                        labels = c("Low", "Medium", "High"))

#scatterplot
plot(rock$area, rock$peri,
     col = c("green", "blue", "red")[rock$resid_group],
     pch = c(16, 17, 18)[rock$resid_group],
     xlab = "Area", ylab = "Perimeter",
     main = "Rock Samples: Perimeter vs Area")

#regression line
abline(model, col = "darkgreen", lwd = 2)

legend("topleft",
       legend = c(levels(rock$resid_group), "Regression Line"),
       col = c("green", "blue", "red", "darkgreen"),
       pch = c(16, 17, 18, NA),
       lty = c(NA, NA, NA, 1),
       cex=0.6)

problem 3

set.seed(123)

#1000 samples from each distribution
normal_samples <- rnorm(1000, mean = -2, sd = 1)
gamma1_samples <- rgamma(1000, shape = 2, rate = 0.5)
gamma2_samples <- rgamma(1000, shape = 1, rate = 1)

#Side-by-side boxplots
boxplot(normal_samples, gamma1_samples, gamma2_samples,
        names = c("N(-2,1)", "Gamma(2,0.5)", "Gamma(1,1)"),
        main = "Side-by-Side Boxplots",
        ylab = "Values")

#Overlayed histograms
hist(normal_samples, col = rgb(1, 0, 0, 0.5), 
     xlim = c(-5, 15), ylim = c(0, 800),
     main = "Overlayed Histograms", 
     xlab = "Values", ylab = "Frequency")

hist(gamma1_samples, col = rgb(0, 1, 0, 0.5), add = TRUE)
hist(gamma2_samples, col = rgb(0, 0, 1, 0.5), add = TRUE)
legend("topright", 
       legend = c("N(-2,1)", "Gamma(2,0.5)", "Gamma(1,1)"),
       fill = c(rgb(1,0,0,0.5), rgb(0,1,0,0.5), rgb(0,0,1,0.5)))

problem4

library(mvtnorm)

# First distribution: y follows MVN(mu=[0,0], sigma=[[9,0],[0,9]])

x <- seq(-10, 10, length = 100)
y <- seq(-10, 10, length = 100)
grid <- expand.grid(x = x, y = y)

#PDF values for first distribution
mu1 <- c(0, 0)
sigma1 <- matrix(c(9, 0, 0, 9), nrow = 2)
z1 <- matrix(dmvnorm(grid, mean = mu1, sigma = sigma1), nrow = length(x))

#3D surface plot for first distribution
persp(x, y, z1,
      xlab = "y1", ylab = "y2", zlab = "Density",
      main = "Bivariate Normal: mu=[0,0], sigma=[[9,0],[0,9]]",
      theta = 30, phi = 30, col = "lightblue")

#Heatmap with contours for first distribution
filled.contour(x, y, z1,
               xlab = "y1", ylab = "y2",
               main = "Heatmap: mu=[0,0], sigma=[[9,0],[0,9]]")

# Second distribution: w ~ MVN(mu=[1,2], sigma=[[9,-8],[-8,9]])
#PDF values for second distribution
mu2 <- c(1, 2)
sigma2 <- matrix(c(9, -8, -8, 9), nrow = 2)
z2 <- matrix(dmvnorm(grid, mean = mu2, sigma = sigma2), nrow = length(x))

# 3D surface plot for second distribution
persp(x, y, z2,
      xlab = "w1", ylab = "w2", zlab = "Density",
      main = "Bivariate Normal: mu=[1,2], sigma=[[9,-8],[-8,9]]",
      theta = 30, phi = 30, col = "lightgreen")

#Heatmap with contours for second distribution
filled.contour(x, y, z2,
               xlab = "w1", ylab = "w2",
               main = "Heatmap: mu=[1,2], sigma=[[9,-8],[-8,9]]")

Homework 4

Sabuj Ganguly

2025-11-05

problem1

problem2

problem 3

problem4