Homework1

#setup

Q1: AutoClaims (age >= 50)

df <- readr::read_csv("AutoClaims-3.csv", show_col_types = FALSE)
df50 <- df %>% filter(age >= 50)
cat("nrow(df50) =", nrow(df50), "\n")

## nrow(df50) = 6773

print(summary(df50$paid))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     9.5   523.7  1001.7  1853.0  2137.4 60000.0

The mean paid is 1853.0 and the median is 1001.7.The median is lower than the mean,indicating a right-skewed distribution. ### Q1(b): Histogram and Normal QQ plot for paid

hist_plot <- ggplot(df50, aes(x = paid)) +
  geom_histogram(bins = 50, color = "black", fill = "grey70") +
  labs(title = "Q1(b) Histogram of paid", x = "paid", y = "Count")
hist_plot

qq_plot <- ggplot(df50, aes(sample = paid)) +
  stat_qq() +
  stat_qq_line() +
  labs(title = "Q1(b) Normal QQ plot for paid")
qq_plot

The histogram shows that the distribution of paid is highly right-skewed, mostobservations concentrated around zero, while a few extremely large claims dominate the upper tail of the distribution; The QQ plot shows strong deviations from the reference line, especially in the upper tail. In a word, the distribution of paid is far from normal distribution. ### Q1(c): Histogram and Normal QQ plot for logpaid (LNPAID)

df50_logpaid <- df50 %>% filter(!is.na(logpaid))
hist_log_plot <- ggplot(df50_logpaid, aes(x = logpaid)) +
  geom_histogram(bins = 50, color = "black", fill = "grey70") +
  labs(title = "Q1(c) Histogram of LNPAID (logpaid)", x = "logpaid (LNPAID)", y = "Count")
hist_log_plot

qq_log_plot <- ggplot(df50_logpaid, aes(sample = logpaid)) +
  stat_qq() +
  stat_qq_line() +
  labs(title = "Q1(c) Normal QQ plot for LNPAID (logpaid)")
qq_log_plot

The histogram of logpaid is much more symmetric compared to that of paid, indicating that the log transformation substantially reduces the strong right skewness observed in the original data. The QQ plot shows that most points lie close to the reference line, especially in the central region, suggesting that the distribution of logpaid is approximately normal.

Q1(d): Scatterplot of `logpaid` vs `age` with loess

df50_logpaid_age <- df50 %>% filter(!is.na(logpaid) & !is.na(age))
scatter_logpaid_age <- ggplot(df50_logpaid_age, aes(x = age, y = logpaid)) +
  geom_point(alpha = 0.5, size = 1) +
  geom_smooth(method = "loess", se = TRUE, color = "blue") +
  labs(title = "Q1(d) Scatterplot of LNPAID vs age", x = "age", y = "logpaid")
scatter_logpaid_age

## `geom_smooth()` using formula = 'y ~ x'

The scatterplot shows no strong relationship between age and logpaid.

Q1(e): Scatterplot colored by gender with separate loess per gender

df50_logpaid_age_gender <- df50 %>%
  filter(!is.na(logpaid) & !is.na(age) & !is.na(gender)) %>%
  mutate(gender = as.factor(gender))

scatter_by_gender <- ggplot(df50_logpaid_age_gender,
                            aes(x = age, y = logpaid, color = gender)) +
  geom_point(alpha = 0.6, size = 1) +
  geom_smooth(method = "loess", se = TRUE) +
  labs(title = "Q1(e) LNPAID vs age by gender",
       x = "age", y = "logpaid")
scatter_by_gender

## `geom_smooth()` using formula = 'y ~ x'

Differences by gender appear small; loess lines may show slight level shifts but no strong divergent trend.

Q2: Boston housing data

library(MASS)
boston <- MASS::Boston
cont_sample <- boston %>% select_if(is.numeric)

cat("Dimensions:", dim(boston), "\n")

## Dimensions: 506 14

print(summary(boston))

##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00

###Q2(a)

p <- ggpairs(cont_sample,
             title = "Q2(a) Pairwise plots (continuous predictors)")
p

For Q2, I chose to compute a ggpairs scatterplots, then analysis the following questions based on this graph.

###Q2(b,c): High crime, tax and pupil-teacher ratio

top_crim <- boston[order(-boston$crim), ][1:5, ]
top_tax  <- boston[order(-boston$tax), ][1:5, ]
top_ptr  <- boston[order(-boston$ptratio), ][1:5, ]

top_crim

##        crim zn indus chas   nox    rm   age    dis rad tax ptratio  black lstat
## 381 88.9762  0  18.1    0 0.671 6.968  91.9 1.4165  24 666    20.2 396.90 17.21
## 419 73.5341  0  18.1    0 0.679 5.957 100.0 1.8026  24 666    20.2  16.45 20.62
## 406 67.9208  0  18.1    0 0.693 5.683 100.0 1.4254  24 666    20.2 384.97 22.98
## 411 51.1358  0  18.1    0 0.597 5.757 100.0 1.4130  24 666    20.2   2.60 10.11
## 415 45.7461  0  18.1    0 0.693 4.519 100.0 1.6582  24 666    20.2  88.27 36.98
##     medv
## 381 10.4
## 419  8.8
## 406  5.0
## 411 15.0
## 415  7.0

top_tax

##        crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 489 0.15086  0 27.74    0 0.609 5.454 92.7 1.8209   4 711    20.1 395.09 18.06
## 490 0.18337  0 27.74    0 0.609 5.414 98.3 1.7554   4 711    20.1 344.05 23.97
## 491 0.20746  0 27.74    0 0.609 5.093 98.0 1.8226   4 711    20.1 318.43 29.68
## 492 0.10574  0 27.74    0 0.609 5.983 98.8 1.8681   4 711    20.1 390.11 18.07
## 493 0.11132  0 27.74    0 0.609 5.983 83.5 2.1099   4 711    20.1 396.90 13.35
##     medv
## 489 15.2
## 490  7.0
## 491  8.1
## 492 13.6
## 493 20.1

top_ptr

##        crim zn indus chas   nox    rm  age     dis rad tax ptratio  black lstat
## 355 0.04301 80  1.91    0 0.413 5.663 21.9 10.5857   4 334    22.0 382.80  8.05
## 356 0.10659 80  1.91    0 0.413 5.936 19.5 10.5857   4 334    22.0 376.04  5.57
## 128 0.25915  0 21.89    0 0.624 5.693 96.0  1.7883   4 437    21.2 392.11 17.19
## 129 0.32543  0 21.89    0 0.624 6.431 98.8  1.8125   4 437    21.2 396.90 15.39
## 130 0.88125  0 21.89    0 0.624 5.637 94.7  1.9799   4 437    21.2 396.90 18.34
##     medv
## 355 18.2
## 356 20.6
## 128 16.2
## 129 18.0
## 130 14.3

Several predictors are strongly associated with per capita crime rate. The strongest positive associations are with rad (≈ 0.63), tax (≈ 0.58), indus (≈ 0.41), and nox (≈ 0.42), while the strongest negative associations are with dis (≈ −0.38), medv (≈ −0.38), and rm (≈ −0.22).

###Q2(d): Charles River

sum(boston$chas == 1)

## [1] 35

35 suburbs are located along the Charles River. ###Q2(e): Median pupil-teacher ratio

median(boston$ptratio)

## [1] 19.05

The median pupil-teacher ratio is 19.05. ###Q2(f): Lowest house value

idx <- which.min(boston$medv)
boston[idx, ]

##        crim zn indus chas   nox    rm age    dis rad tax ptratio black lstat
## 399 38.3518  0  18.1    0 0.693 5.453 100 1.4896  24 666    20.2 396.9 30.59
##     medv
## 399    5

399 has the lowest 399, with medv = 5.This suburb has extremely high crime rate (crim = 38.35), very high tax rate (tax = 666), high pupil–teacher ratio (ptratio = 20.2), high pollution level (nox = 0.693), very low distance to employment centers (dis = 1.49), small average number of rooms (rm = 5.45), old housing stock (age = 100), and very high proportion of lower-status population (lstat = 30.59)

###Q2(g): More than 7 or 8 rooms

sum(boston$rm > 7)

## [1] 64

sum(boston$rm > 8)

## [1] 13

boston[boston$rm > 8, ]

##        crim zn indus chas    nox    rm  age    dis rad tax ptratio  black lstat
## 98  0.12083  0  2.89    0 0.4450 8.069 76.0 3.4952   2 276    18.0 396.90  4.21
## 164 1.51902  0 19.58    1 0.6050 8.375 93.9 2.1620   5 403    14.7 388.45  3.32
## 205 0.02009 95  2.68    0 0.4161 8.034 31.9 5.1180   4 224    14.7 390.55  2.88
## 225 0.31533  0  6.20    0 0.5040 8.266 78.3 2.8944   8 307    17.4 385.05  4.14
## 226 0.52693  0  6.20    0 0.5040 8.725 83.0 2.8944   8 307    17.4 382.00  4.63
## 227 0.38214  0  6.20    0 0.5040 8.040 86.5 3.2157   8 307    17.4 387.38  3.13
## 233 0.57529  0  6.20    0 0.5070 8.337 73.3 3.8384   8 307    17.4 385.91  2.47
## 234 0.33147  0  6.20    0 0.5070 8.247 70.4 3.6519   8 307    17.4 378.95  3.95
## 254 0.36894 22  5.86    0 0.4310 8.259  8.4 8.9067   7 330    19.1 396.90  3.54
## 258 0.61154 20  3.97    0 0.6470 8.704 86.9 1.8010   5 264    13.0 389.70  5.12
## 263 0.52014 20  3.97    0 0.6470 8.398 91.5 2.2885   5 264    13.0 386.86  5.91
## 268 0.57834 20  3.97    0 0.5750 8.297 67.0 2.4216   5 264    13.0 384.54  7.44
## 365 3.47428  0 18.10    1 0.7180 8.780 82.9 1.9047  24 666    20.2 354.55  5.29
##     medv
## 98  38.7
## 164 50.0
## 205 50.0
## 225 44.8
## 226 50.0
## 227 37.6
## 233 41.7
## 234 48.3
## 254 42.8
## 258 50.0
## 263 48.8
## 268 50.0
## 365 21.9

There are 64 suburbs with more than seven rooms per dwelling, and only 13 with more than eight rooms. These suburbs represent the upper end of housing quality and are likely to be more affluent areas.

Q3: K-Nearest Neighbors

q3_df <- tibble::tibble(
  Obs = 1:7,
  X1 = c(0, 1.5, 0, 0, -1, 1, -1),
  X2 = c(3, 0, 1, 1, 0, 1, 3),
  X3 = c(0, 1, 3, 2, 1, 1, -1),
  Y = c("Red", "Red", "Red", "Green", "Green", "Red", "Green")
)

q3_df <- q3_df %>%
  mutate(distance = sqrt(X1^2 + X2^2 + X3^2)) %>%
  arrange(distance)

q3_df

## # A tibble: 7 × 6
##     Obs    X1    X2    X3 Y     distance
##   <int> <dbl> <dbl> <dbl> <chr>    <dbl>
## 1     5  -1       0     1 Green     1.41
## 2     6   1       1     1 Red       1.73
## 3     2   1.5     0     1 Red       1.80
## 4     4   0       1     2 Green     2.24
## 5     1   0       3     0 Red       3   
## 6     3   0       1     3 Red       3.16
## 7     7  -1       3    -1 Green     3.32

Q3(b): K = 1 With K = 1, the nearest neighbor is observation 5 (distance = 1.41), which is Green. Therefore, the predicted class is Green.

Q3(c): K = 3 The three nearest neighbors are observations 5 (Green), 6 (Red), and 2 (Red). Since two out of three are Red, the predicted class is Red.

Q3(d): We would expect the best value of K to be small, because a smaller K allows the decision boundary to be more flexible and better capture highly non-linear patterns

Homework1

Huaiyu Ruan

2026-01-25

Q1: AutoClaims (age >= 50)

Q1(d): Scatterplot of `logpaid` vs `age` with loess

Q1(e): Scatterplot colored by gender with separate loess per gender

Q2: Boston housing data

Q3: K-Nearest Neighbors

Homework1

Huaiyu Ruan

2026-01-25

Q1: AutoClaims (age >= 50)

Q1(d): Scatterplot of logpaid vs age with loess

Q1(e): Scatterplot colored by gender with separate loess per gender

Q2: Boston housing data

Q3: K-Nearest Neighbors

Q1(d): Scatterplot of `logpaid` vs `age` with loess