#setup
df <- readr::read_csv("AutoClaims-3.csv", show_col_types = FALSE)
df50 <- df %>% filter(age >= 50)
cat("nrow(df50) =", nrow(df50), "\n")
## nrow(df50) = 6773
print(summary(df50$paid))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.5 523.7 1001.7 1853.0 2137.4 60000.0
The mean paid is 1853.0 and the median is 1001.7.The median is lower
than the mean,indicating a right-skewed distribution. ### Q1(b):
Histogram and Normal QQ plot for paid
hist_plot <- ggplot(df50, aes(x = paid)) +
geom_histogram(bins = 50, color = "black", fill = "grey70") +
labs(title = "Q1(b) Histogram of paid", x = "paid", y = "Count")
hist_plot
qq_plot <- ggplot(df50, aes(sample = paid)) +
stat_qq() +
stat_qq_line() +
labs(title = "Q1(b) Normal QQ plot for paid")
qq_plot
The histogram shows that the distribution of paid is highly
right-skewed, mostobservations concentrated around zero, while a few
extremely large claims dominate the upper tail of the distribution; The
QQ plot shows strong deviations from the reference line, especially in
the upper tail. In a word, the distribution of paid is far from normal
distribution. ### Q1(c): Histogram and Normal QQ plot for
logpaid (LNPAID)
df50_logpaid <- df50 %>% filter(!is.na(logpaid))
hist_log_plot <- ggplot(df50_logpaid, aes(x = logpaid)) +
geom_histogram(bins = 50, color = "black", fill = "grey70") +
labs(title = "Q1(c) Histogram of LNPAID (logpaid)", x = "logpaid (LNPAID)", y = "Count")
hist_log_plot
qq_log_plot <- ggplot(df50_logpaid, aes(sample = logpaid)) +
stat_qq() +
stat_qq_line() +
labs(title = "Q1(c) Normal QQ plot for LNPAID (logpaid)")
qq_log_plot
The histogram of logpaid is much more symmetric compared to that of
paid, indicating that the log transformation substantially reduces the
strong right skewness observed in the original data. The QQ plot shows
that most points lie close to the reference line, especially in the
central region, suggesting that the distribution of logpaid is
approximately normal.
logpaid vs age with
loessdf50_logpaid_age <- df50 %>% filter(!is.na(logpaid) & !is.na(age))
scatter_logpaid_age <- ggplot(df50_logpaid_age, aes(x = age, y = logpaid)) +
geom_point(alpha = 0.5, size = 1) +
geom_smooth(method = "loess", se = TRUE, color = "blue") +
labs(title = "Q1(d) Scatterplot of LNPAID vs age", x = "age", y = "logpaid")
scatter_logpaid_age
## `geom_smooth()` using formula = 'y ~ x'
The scatterplot shows no strong relationship between age and
logpaid.
df50_logpaid_age_gender <- df50 %>%
filter(!is.na(logpaid) & !is.na(age) & !is.na(gender)) %>%
mutate(gender = as.factor(gender))
scatter_by_gender <- ggplot(df50_logpaid_age_gender,
aes(x = age, y = logpaid, color = gender)) +
geom_point(alpha = 0.6, size = 1) +
geom_smooth(method = "loess", se = TRUE) +
labs(title = "Q1(e) LNPAID vs age by gender",
x = "age", y = "logpaid")
scatter_by_gender
## `geom_smooth()` using formula = 'y ~ x'
Differences by gender appear small; loess lines may show slight level
shifts but no strong divergent trend.
library(MASS)
boston <- MASS::Boston
cont_sample <- boston %>% select_if(is.numeric)
cat("Dimensions:", dim(boston), "\n")
## Dimensions: 506 14
print(summary(boston))
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
###Q2(a)
p <- ggpairs(cont_sample,
title = "Q2(a) Pairwise plots (continuous predictors)")
p
For Q2, I chose to compute a ggpairs scatterplots, then analysis the
following questions based on this graph.
###Q2(b,c): High crime, tax and pupil-teacher ratio
top_crim <- boston[order(-boston$crim), ][1:5, ]
top_tax <- boston[order(-boston$tax), ][1:5, ]
top_ptr <- boston[order(-boston$ptratio), ][1:5, ]
top_crim
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 381 88.9762 0 18.1 0 0.671 6.968 91.9 1.4165 24 666 20.2 396.90 17.21
## 419 73.5341 0 18.1 0 0.679 5.957 100.0 1.8026 24 666 20.2 16.45 20.62
## 406 67.9208 0 18.1 0 0.693 5.683 100.0 1.4254 24 666 20.2 384.97 22.98
## 411 51.1358 0 18.1 0 0.597 5.757 100.0 1.4130 24 666 20.2 2.60 10.11
## 415 45.7461 0 18.1 0 0.693 4.519 100.0 1.6582 24 666 20.2 88.27 36.98
## medv
## 381 10.4
## 419 8.8
## 406 5.0
## 411 15.0
## 415 7.0
top_tax
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 489 0.15086 0 27.74 0 0.609 5.454 92.7 1.8209 4 711 20.1 395.09 18.06
## 490 0.18337 0 27.74 0 0.609 5.414 98.3 1.7554 4 711 20.1 344.05 23.97
## 491 0.20746 0 27.74 0 0.609 5.093 98.0 1.8226 4 711 20.1 318.43 29.68
## 492 0.10574 0 27.74 0 0.609 5.983 98.8 1.8681 4 711 20.1 390.11 18.07
## 493 0.11132 0 27.74 0 0.609 5.983 83.5 2.1099 4 711 20.1 396.90 13.35
## medv
## 489 15.2
## 490 7.0
## 491 8.1
## 492 13.6
## 493 20.1
top_ptr
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 355 0.04301 80 1.91 0 0.413 5.663 21.9 10.5857 4 334 22.0 382.80 8.05
## 356 0.10659 80 1.91 0 0.413 5.936 19.5 10.5857 4 334 22.0 376.04 5.57
## 128 0.25915 0 21.89 0 0.624 5.693 96.0 1.7883 4 437 21.2 392.11 17.19
## 129 0.32543 0 21.89 0 0.624 6.431 98.8 1.8125 4 437 21.2 396.90 15.39
## 130 0.88125 0 21.89 0 0.624 5.637 94.7 1.9799 4 437 21.2 396.90 18.34
## medv
## 355 18.2
## 356 20.6
## 128 16.2
## 129 18.0
## 130 14.3
Several predictors are strongly associated with per capita crime rate. The strongest positive associations are with rad (≈ 0.63), tax (≈ 0.58), indus (≈ 0.41), and nox (≈ 0.42), while the strongest negative associations are with dis (≈ −0.38), medv (≈ −0.38), and rm (≈ −0.22).
###Q2(d): Charles River
sum(boston$chas == 1)
## [1] 35
35 suburbs are located along the Charles River. ###Q2(e): Median pupil-teacher ratio
median(boston$ptratio)
## [1] 19.05
The median pupil-teacher ratio is 19.05. ###Q2(f): Lowest house value
idx <- which.min(boston$medv)
boston[idx, ]
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 399 38.3518 0 18.1 0 0.693 5.453 100 1.4896 24 666 20.2 396.9 30.59
## medv
## 399 5
399 has the lowest 399, with medv = 5.This suburb has extremely high crime rate (crim = 38.35), very high tax rate (tax = 666), high pupil–teacher ratio (ptratio = 20.2), high pollution level (nox = 0.693), very low distance to employment centers (dis = 1.49), small average number of rooms (rm = 5.45), old housing stock (age = 100), and very high proportion of lower-status population (lstat = 30.59)
###Q2(g): More than 7 or 8 rooms
sum(boston$rm > 7)
## [1] 64
sum(boston$rm > 8)
## [1] 13
boston[boston$rm > 8, ]
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 98 0.12083 0 2.89 0 0.4450 8.069 76.0 3.4952 2 276 18.0 396.90 4.21
## 164 1.51902 0 19.58 1 0.6050 8.375 93.9 2.1620 5 403 14.7 388.45 3.32
## 205 0.02009 95 2.68 0 0.4161 8.034 31.9 5.1180 4 224 14.7 390.55 2.88
## 225 0.31533 0 6.20 0 0.5040 8.266 78.3 2.8944 8 307 17.4 385.05 4.14
## 226 0.52693 0 6.20 0 0.5040 8.725 83.0 2.8944 8 307 17.4 382.00 4.63
## 227 0.38214 0 6.20 0 0.5040 8.040 86.5 3.2157 8 307 17.4 387.38 3.13
## 233 0.57529 0 6.20 0 0.5070 8.337 73.3 3.8384 8 307 17.4 385.91 2.47
## 234 0.33147 0 6.20 0 0.5070 8.247 70.4 3.6519 8 307 17.4 378.95 3.95
## 254 0.36894 22 5.86 0 0.4310 8.259 8.4 8.9067 7 330 19.1 396.90 3.54
## 258 0.61154 20 3.97 0 0.6470 8.704 86.9 1.8010 5 264 13.0 389.70 5.12
## 263 0.52014 20 3.97 0 0.6470 8.398 91.5 2.2885 5 264 13.0 386.86 5.91
## 268 0.57834 20 3.97 0 0.5750 8.297 67.0 2.4216 5 264 13.0 384.54 7.44
## 365 3.47428 0 18.10 1 0.7180 8.780 82.9 1.9047 24 666 20.2 354.55 5.29
## medv
## 98 38.7
## 164 50.0
## 205 50.0
## 225 44.8
## 226 50.0
## 227 37.6
## 233 41.7
## 234 48.3
## 254 42.8
## 258 50.0
## 263 48.8
## 268 50.0
## 365 21.9
There are 64 suburbs with more than seven rooms per dwelling, and only 13 with more than eight rooms. These suburbs represent the upper end of housing quality and are likely to be more affluent areas.
q3_df <- tibble::tibble(
Obs = 1:7,
X1 = c(0, 1.5, 0, 0, -1, 1, -1),
X2 = c(3, 0, 1, 1, 0, 1, 3),
X3 = c(0, 1, 3, 2, 1, 1, -1),
Y = c("Red", "Red", "Red", "Green", "Green", "Red", "Green")
)
q3_df <- q3_df %>%
mutate(distance = sqrt(X1^2 + X2^2 + X3^2)) %>%
arrange(distance)
q3_df
## # A tibble: 7 × 6
## Obs X1 X2 X3 Y distance
## <int> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 5 -1 0 1 Green 1.41
## 2 6 1 1 1 Red 1.73
## 3 2 1.5 0 1 Red 1.80
## 4 4 0 1 2 Green 2.24
## 5 1 0 3 0 Red 3
## 6 3 0 1 3 Red 3.16
## 7 7 -1 3 -1 Green 3.32
Q3(b): K = 1 With K = 1, the nearest neighbor is observation 5 (distance = 1.41), which is Green. Therefore, the predicted class is Green.
Q3(c): K = 3 The three nearest neighbors are observations 5 (Green), 6 (Red), and 2 (Red). Since two out of three are Red, the predicted class is Red.
Q3(d): We would expect the best value of K to be small, because a smaller K allows the decision boundary to be more flexible and better capture highly non-linear patterns