This report includes:
# Make the Boston housing data frame available in the workspace
data("Boston")

# Simple linear regression: median house value (medv) explained by
# crime rate (crim) alone
model_b <- lm(formula = medv ~ crim, data = Boston)

# Coefficient table, residual quantiles and overall fit statistics
summary(model_b)
##
## Call:
## lm(formula = medv ~ crim, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.957 -5.449 -2.007 2.512 29.800
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.03311 0.40914 58.74 <2e-16 ***
## crim -0.41519 0.04389 -9.46 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.484 on 504 degrees of freedom
## Multiple R-squared: 0.1508, Adjusted R-squared: 0.1491
## F-statistic: 89.49 on 1 and 504 DF, p-value: < 2.2e-16
The p-value for crim is examined.
If:
p < 0.05 → significant
Result:
The crime rate is statistically significant because its p-value is less than 0.05.
The coefficient for crime rate is negative.
This means:
Higher crime rates are associated with lower median house values.
Variables added: rm, ptratio, and nox.
##
## Call:
## lm(formula = medv ~ crim + rm + ptratio + nox, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.966 -3.207 -0.563 1.881 39.588
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.64051 4.36496 1.521 0.129
## crim -0.13906 0.03362 -4.136 4.14e-05 ***
## rm 6.90415 0.40150 17.196 < 2e-16 ***
## ptratio -1.06741 0.12943 -8.247 1.44e-15 ***
## nox -13.15253 2.49470 -5.272 2.01e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.724 on 501 degrees of freedom
## Multiple R-squared: 0.6157, Adjusted R-squared: 0.6126
## F-statistic: 200.6 on 4 and 501 DF, p-value: < 2.2e-16
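rm (rooms)
Positive coefficient (6.90) → More rooms are associated with higher house values, holding the other predictors constant.
nox (pollution)
Negative coefficient (-13.15) → Higher pollution is associated with lower house values, holding the other predictors constant.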
# Store the residuals and fitted values of the simple model (model_b)
# alongside the data so they can be plotted against each other.
Boston$residuals <- resid(model_b)
Boston$fitted <- fitted(model_b)

# Residuals-vs-fitted diagnostic plot.
# A straight "lm" smoother is uninformative here: least-squares residuals are
# uncorrelated with the fitted values, so that line is flat at zero for every
# OLS model, good or bad. A loess smoother can bend, so curvature (a sign of
# misspecification) becomes visible. The dashed line marks the zero reference.
ggplot(Boston, aes(x = fitted, y = residuals)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_point() +
  geom_smooth(method = "loess") +
  labs(
    title = "Residual Plot (Fitted vs Residuals)",
    x = "Fitted Values",
    y = "Residuals"
  )
## `geom_smooth()` using formula = 'y ~ x'
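A straight trendline through least-squares residuals always has a slope of zero by construction, so a flat line does not by itself indicate a good fit.
The wide spread of the residuals (from about -17 to 30) and the low R-squared (0.15) indicate that crim alone explains only a small part of the variation in medv.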
Files used: extreme.csv and sealevel.csv.
# Sea level readings for the extreme days
extreme <- read.csv("extreme.csv")

# One line per year. The x position is a running hour count built from the
# day and hour columns (assumes day is numbered from 1 — TODO confirm).
ggplot(
  extreme,
  aes(
    x = hour + (day - 1) * 24,
    y = sealevel,
    color = factor(year)
  )
) +
  geom_line() +
  labs(
    title = "Sea Level During Extreme Days",
    x = "Hour",
    y = "Sea Level (cm)",
    color = "Year"
  )

# Load the sea level series used by the following sections.
# This statement must start on its own line: fused onto the closing
# parenthesis of the plot call above, it is a syntax error.
sealevel <- read.csv("sealevel.csv")
# Parse the date column and derive the year as a character label
sealevel <- sealevel %>%
  mutate(
    date = as.Date(date),
    year = format(date, "%Y")
  )

# Mean, maximum and minimum sea level for each year
year_data <- sealevel %>%
  group_by(year) %>%
  summarise(
    mean_level = mean(sealevel, na.rm = TRUE),
    max_level = max(sealevel, na.rm = TRUE),
    min_level = min(sealevel, na.rm = TRUE),
    .groups = "drop"
  )

# One line per statistic; year is converted back to a number for the x axis
ggplot(year_data, aes(x = as.numeric(year))) +
  geom_line(aes(y = mean_level, color = "Mean")) +
  geom_line(aes(y = max_level, color = "Max")) +
  geom_line(aes(y = min_level, color = "Min")) +
  scale_color_manual(
    values = c("Mean" = "red", "Max" = "blue", "Min" = "green")
  ) +
  labs(
    title = "Sea Level Statistics per Year",
    x = "Year",
    y = "Sea Level (cm)",
    color = "Statistic"
  )
# Extract month and year
sealevel <- sealevel %>%
  mutate(
    month = month(date),
    year = year(date)
  )

# Restrict to 1960–2009 and label every row with its 10-year period.
# Intervals are closed on the left, so 1970 falls into "1970-1979".
data_10yr <- sealevel %>%
  filter(year >= 1960, year <= 2009) %>%
  mutate(
    period = cut(
      year,
      breaks = seq(1960, 2010, by = 10),
      right = FALSE,
      labels = paste(
        seq(1960, 2000, by = 10),
        seq(1960, 2000, by = 10) + 9,
        sep = "-"
      )
    )
  )

# Median sea level for every period/month combination
month_data <- data_10yr %>%
  group_by(period, month) %>%
  summarise(
    median_level = median(sealevel, na.rm = TRUE),
    .groups = "drop"
  )
# Grouped bar chart: within each month, one bar per 10-year period.
# With a numeric month column the default continuous axis places ticks at
# fractional positions (2.5, 5.0, 7.5, ...), so the breaks are pinned to one
# labelled tick per month. Assumes month holds the numbers 1-12, as produced
# by month() with its default arguments — TODO confirm.
ggplot(month_data, aes(x = month, y = median_level, fill = period)) +
  geom_col(position = "dodge") +
  scale_x_continuous(breaks = 1:12, labels = month.abb) +
  labs(
    title = "Median Sea Level per Month (10-year periods)",
    x = "Month",
    y = "Sea Level (cm)",
    fill = "Period"
  )