Introduction

This report includes:

Linear regression analysis using the Boston housing dataset
Visualization of sea level data
Statistical interpretation of results

Load Required Libraries

library(MASS)
library(ggplot2)
library(dplyr)
library(lubridate)

PART 1 — Linear Regression (Boston Dataset)

Run Regression: medv vs crim

# Make the Boston housing data (shipped with MASS) available
data("Boston")

# Simple linear model: median house value (medv) explained by crime rate (crim)
model_b <- lm(formula = medv ~ crim, data = Boston)

# Print the residual summary, coefficient table and fit statistics
summary(model_b)

## 
## Call:
## lm(formula = medv ~ crim, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.957  -5.449  -2.007   2.512  29.800 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 24.03311    0.40914   58.74   <2e-16 ***
## crim        -0.41519    0.04389   -9.46   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.484 on 504 degrees of freedom
## Multiple R-squared:  0.1508, Adjusted R-squared:  0.1491 
## F-statistic: 89.49 on 1 and 504 DF,  p-value: < 2.2e-16

Crime Rate Significance

The p-value for crim is examined.

If:

p < 0.05 → significant

Result:

The crime rate is statistically significant: its p-value (< 2e-16) is far below the 0.05 threshold.

Interpretation of Crime Effect

The coefficient for crime rate is negative.

This means:

Higher crime rates are associated with lower median house values: each one-unit increase in crim is associated with a decrease of about 0.42 in medv.

Multiple Linear Regression

Variables added:

rm → rooms
ptratio → pupil-teacher ratio
nox → nitrogen oxides

# Multiple regression: extend the crime-rate model with rooms (rm),
# pupil-teacher ratio (ptratio) and nitrogen oxides (nox)
model_c <- lm(formula = medv ~ crim + rm + ptratio + nox, data = Boston)

# Print the residual summary, coefficient table and fit statistics
summary(model_c)

## 
## Call:
## lm(formula = medv ~ crim + rm + ptratio + nox, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.966  -3.207  -0.563   1.881  39.588 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.64051    4.36496   1.521    0.129    
## crim         -0.13906    0.03362  -4.136 4.14e-05 ***
## rm            6.90415    0.40150  17.196  < 2e-16 ***
## ptratio      -1.06741    0.12943  -8.247 1.44e-15 ***
## nox         -13.15253    2.49470  -5.272 2.01e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.724 on 501 degrees of freedom
## Multiple R-squared:  0.6157, Adjusted R-squared:  0.6126 
## F-statistic: 200.6 on 4 and 501 DF,  p-value: < 2.2e-16

Interpretation of rm and nox

rm (rooms)
Positive coefficient (6.90) → Holding the other predictors constant, each additional room is associated with a higher house value.

nox (pollution)
Negative coefficient (-13.15) → Holding the other predictors constant, higher pollution is associated with a lower house value.

Residual Plot

# Collect the fitted values and residuals of the simple model (model_b) in
# their own data frame. Writing them into Boston would alter the shared
# dataset, and any later formula such as `medv ~ .` would then silently
# pick them up as predictors.
resid_data <- data.frame(
  fitted = fitted(model_b),
  residuals = resid(model_b)
)

# Residuals against fitted values, with a least-squares trendline
ggplot(resid_data, aes(x = fitted, y = residuals)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Residual Plot (Fitted vs Residuals)",
    x = "Fitted Values",
    y = "Residuals"
  )

## `geom_smooth()` using formula = 'y ~ x'

Residual Interpretation

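The slope of the trendline is close to zero, so the residuals show no remaining linear trend against the fitted values.

This is expected for any least-squares fit, so it does not by itself indicate that the model fits well; the spread of the residuals and any curvature in the points should also be considered.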

PART 2 — Sea Level Visualization

Files used:

extreme.csv
sealevel.csv

Extreme Sea Level Plot (48 Hours)

# Read the sea level records for the extreme days
extreme <- read.csv("extreme.csv")

# One coloured line per year; the x position combines day and hour into a
# single running hour count, (day - 1) * 24 + hour
ggplot(extreme) +
  geom_line(
    aes(
      x = (day - 1) * 24 + hour,
      y = sealevel,
      colour = factor(year)
    )
  ) +
  labs(
    title = "Sea Level During Extreme Days",
    x = "Hour",
    y = "Sea Level (cm)",
    colour = "Year"
  )

Mean, Max, Min Sea Level per Year

sealevel <- read.csv("sealevel.csv")

# Parse the date column into Date objects
sealevel$date <- as.Date(sealevel$date)

# Store the calendar year as a number so it can be used directly for
# filtering and plotting (format(date, "%Y") would give character strings)
sealevel$year <- year(sealevel$date)

# Yearly mean, maximum and minimum sea level, ignoring missing readings.
# `.data$sealevel` forces the lookup to the sealevel *column*; the data frame
# has the same name, and a bare `sealevel` would silently fall back to the
# whole data frame if the column were absent.
year_data <- sealevel %>%
  group_by(year) %>%
  summarise(
    mean_level = mean(.data$sealevel, na.rm = TRUE),
    max_level  = max(.data$sealevel, na.rm = TRUE),
    min_level  = min(.data$sealevel, na.rm = TRUE),
    .groups = "drop"
  )

# Three lines (mean, max, min) over the years. The constant strings mapped to
# colour become the legend entries, which the manual scale then colours.
ggplot(year_data, aes(x = as.numeric(year))) +
  geom_line(aes(y = mean_level, colour = "Mean")) +
  geom_line(aes(y = max_level, colour = "Max")) +
  geom_line(aes(y = min_level, colour = "Min")) +
  scale_colour_manual(
    values = c(Mean = "red", Max = "blue", Min = "green")
  ) +
  labs(
    title = "Sea Level Statistics per Year",
    x = "Year",
    y = "Sea Level (cm)",
    colour = "Statistic"
  )

Median Sea Level per Month (10-Year Periods)

# Derive numeric month and year columns from the date
sealevel <- sealevel %>%
  mutate(
    month = month(date),
    year = year(date)
  )

# First year of each 10-year period: 1960, 1970, ..., 2000
decade_starts <- seq(1960, 2000, by = 10)

# Restrict to 1960-2009 and label each row with its period. The intervals are
# left-closed (right = FALSE), so 1970 belongs to "1970-1979", not "1960-1969".
data_10yr <- sealevel %>%
  filter(between(year, 1960, 2009)) %>%
  mutate(
    period = cut(
      year,
      breaks = c(decade_starts, 2010),
      right = FALSE,
      labels = paste(decade_starts, decade_starts + 9, sep = "-")
    )
  )

# Median sea level for every period/month combination, ignoring missing values
month_data <- data_10yr %>%
  group_by(period, month) %>%
  summarise(
    median_level = median(sealevel, na.rm = TRUE),
    .groups = "drop"
  )

# Grouped bars: one bar per 10-year period within each month
ggplot(month_data, aes(x = month, y = median_level, fill = period)) +
  geom_col(position = "dodge") +
  # month is numeric, so the default continuous axis would choose fractional
  # ticks (e.g. 2.5, 7.5); place exactly one tick on each calendar month
  scale_x_continuous(breaks = 1:12) +
  labs(
    title = "Median Sea Level per Month (10-year periods)",
    x = "Month",
    y = "Sea Level (cm)",
    fill = "Period"
  )

Assignment 2: Linear Regression and Sea Level Analysis

Iman makhzoum

2026-04-19