9.a)

# Drop every row of Auto that contains a missing value before any analysis.
Auto <- na.omit(Auto)
Quantitative: mpg, displacement, horsepower, weight, acceleration, year
Qualitative: cylinders, origin, name

9.b)

# Work on a copy of Auto with incomplete rows removed.
auto_df <- na.omit(Auto)

# Names of the numeric columns of auto_df. vapply() is used instead of
# sapply() so the result is always a logical vector, whatever the input.
quantitative_predictors <- names(auto_df)[
  vapply(auto_df, is.numeric, logical(1))
]

# 2 x k matrix: row 1 is each predictor's minimum, row 2 its maximum.
# USE.NAMES = FALSE keeps the result unnamed so that data.frame() below does
# not adopt the predictor names as row names (rows stay numbered 1..k).
predictor_limits <- vapply(
  auto_df[quantitative_predictors],
  range,
  numeric(2),
  USE.NAMES = FALSE
)

# One row per predictor: minimum, maximum and their difference. Built in one
# step instead of filling an NA-initialised frame row by row, which also
# avoids creating a variable that masks base::range().
range_df <- data.frame(
  Predictor = quantitative_predictors,
  Minimum = predictor_limits[1, ],
  Maximum = predictor_limits[2, ],
  Range = predictor_limits[2, ] - predictor_limits[1, ]
)

print(range_df)
##      Predictor Minimum Maximum  Range
## 1          mpg       9    46.6   37.6
## 2    cylinders       3     8.0    5.0
## 3 displacement      68   455.0  387.0
## 4   horsepower      46   230.0  184.0
## 5       weight    1613  5140.0 3527.0
## 6 acceleration       8    24.8   16.8
## 7         year      70    82.0   12.0
## 8       origin       1     3.0    2.0

9.c)

# Mean and standard deviation of every quantitative predictor in auto_df.
# The statistics are computed column-wise with vapply() rather than by
# looping over 1:nrow() and mutating an NA-filled frame; USE.NAMES = FALSE
# keeps the vectors unnamed so the printed row names remain 1..k.
numeric_columns <- auto_df[quantitative_predictors]

summary_df <- data.frame(
  Predictor = quantitative_predictors,
  Mean = vapply(numeric_columns, mean, numeric(1), USE.NAMES = FALSE),
  SD = vapply(numeric_columns, sd, numeric(1), USE.NAMES = FALSE)
)

print(summary_df)
##      Predictor        Mean          SD
## 1          mpg   23.445918   7.8050075
## 2    cylinders    5.471939   1.7057832
## 3 displacement  194.411990 104.6440039
## 4   horsepower  104.469388  38.4911599
## 5       weight 2977.584184 849.4025600
## 6 acceleration   15.541327   2.7588641
## 7         year   75.979592   3.6837365
## 8       origin    1.576531   0.8055182

9.d)

# Drop observations 10 through 85 (negative indexing removes those rows).
subset_data <- auto_df[-(10:85), ]

# Names of the numeric columns of the subset. vapply() guarantees a logical
# vector, unlike sapply(), whose return type depends on its input.
quantitative_predictors <- names(subset_data)[
  vapply(subset_data, is.numeric, logical(1))
]
subset_numeric <- subset_data[quantitative_predictors]

# 2 x k matrix: row 1 is each predictor's minimum, row 2 its maximum.
# USE.NAMES = FALSE keeps everything unnamed so data.frame() numbers the
# rows 1..k instead of labelling them with predictor names.
subset_limits <- vapply(subset_numeric, range, numeric(2), USE.NAMES = FALSE)

# Range, mean and standard deviation per predictor, built in one step rather
# than via a 1:nrow() loop that also masked base::range() with a variable.
summary_df <- data.frame(
  Predictor = quantitative_predictors,
  Minimum = subset_limits[1, ],
  Maximum = subset_limits[2, ],
  Range = subset_limits[2, ] - subset_limits[1, ],
  Mean = vapply(subset_numeric, mean, numeric(1), USE.NAMES = FALSE),
  SD = vapply(subset_numeric, sd, numeric(1), USE.NAMES = FALSE)
)

print(summary_df)
##      Predictor Minimum Maximum  Range        Mean         SD
## 1          mpg    11.0    46.6   35.6   24.404430   7.867283
## 2    cylinders     3.0     8.0    5.0    5.373418   1.654179
## 3 displacement    68.0   455.0  387.0  187.240506  99.678367
## 4   horsepower    46.0   230.0  184.0  100.721519  35.708853
## 5       weight  1649.0  4997.0 3348.0 2935.971519 811.300208
## 6 acceleration     8.5    24.8   16.3   15.726899   2.693721
## 7         year    70.0    82.0   12.0   77.145570   3.106217
## 8       origin     1.0     3.0    2.0    1.601266   0.819910

9.e)

# Scatterplot matrix of all columns of Auto against each other.
pairs(Auto)

# Engine displacement (y-axis) against fuel efficiency, mpg (x-axis).
plot(Auto$mpg, Auto$displacement)

# Fuel efficiency, mpg (y-axis) against horsepower (x-axis).
plot(Auto$horsepower, Auto$mpg)

# Fuel efficiency, mpg (y-axis) against number of cylinders (x-axis).
plot(Auto$cylinders, Auto$mpg)

# Fuel efficiency, mpg (y-axis) against acceleration (x-axis).
# NOTE(review): the ISLR Auto documentation describes `acceleration` as the
# time in seconds to go from 0 to 60 mph, so larger values mean slower
# cars; confirm before interpreting the sign of the relationship.
plot(Auto$acceleration, Auto$mpg)

The graph shows a negative correlation between fuel efficiency (mpg) and engine displacement. This could be because cars with larger engines or higher displacement usually have lower fuel efficiency.

The graph shows a negative correlation between horsepower and fuel efficiency. This indicates that cars with higher horsepower tend to have lower fuel efficiency.

The graph shows a negative correlation between the number of cylinders and fuel efficiency. This means cars with more cylinders usually have lower fuel efficiency.

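The graph shows a positive correlation between acceleration and fuel efficiency. In the Auto data, acceleration is the time in seconds to go from 0 to 60 mph, so a smaller value means a faster car; the plot therefore indicates that cars that accelerate faster get worse gas mileage. Since it takes more energy to accelerate a car quickly, this energy comes from burning fuel.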

9.f)

# Pearson correlation of mpg (the response) with each candidate predictor.
cor(Auto$displacement, Auto$mpg)
## [1] -0.8051269
cor(Auto$horsepower, Auto$mpg)
## [1] -0.7784268
# Year is correlated with mpg here, like every other predictor in this list;
# correlating it with displacement says nothing about predicting mpg.
cor(Auto$year, Auto$mpg)
## [1] 0.580541
cor(Auto$weight, Auto$mpg)
## [1] -0.8322442
cor(Auto$cylinders, Auto$mpg)
## [1] -0.7776175
cor(Auto$origin, Auto$mpg)
## [1] 0.5652088

Based on the correlations above, displacement, horsepower, weight, year, and cylinders can be used as predictors for mpg.

10.a)

# Open the help page for the Boston data set.
help("Boston")
## starting httpd help server ... done
# Dimensions of Boston: number of rows, then number of columns.
dim(x = Boston)
## [1] 506  13
# Names of the columns of Boston.
names(Boston)
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "lstat"   "medv"

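There are 506 rows and 13 columns in this dataset. Each row represents one Boston suburb (census tract), and each column is a variable measured for that suburb.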
The columns in the dataset are:
crim: per capita crime rate by town.
zn: proportion of residential land zoned for lots over 25,000 sq.ft.
indus: proportion of non-retail business acres per town.
chas: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
nox: nitrogen oxides concentration (parts per 10 million).
rm: average number of rooms per dwelling.
age: proportion of owner-occupied units built prior to 1940.
dis: weighted mean of distances to five Boston employment centres.
rad: index of accessibility to radial highways.
tax: full-value property-tax rate per $10,000.
ptratio: pupil-teacher ratio by town.
lstat: lower status of the population (percent).
medv: median value of owner-occupied homes in $1000s.

10.b)

# Scatterplot matrix of all columns of Boston against each other.
pairs(Boston)

# Median home value, medv (y-axis) against average rooms per dwelling, rm
# (x-axis).
plot(Boston$rm, Boston$medv)

# Median home value, medv (y-axis) against pupil-teacher ratio (x-axis).
plot(Boston$ptratio, Boston$medv)

# Median home value, medv (y-axis) against dis, the weighted mean distance
# to five Boston employment centres (x-axis); larger dis means farther away.
plot(Boston$dis, Boston$medv)

# Percentage of lower-status population, lstat (y-axis) against per capita
# crime rate, crim (x-axis).
plot(Boston$crim, Boston$lstat)

# Property-tax rate, tax (y-axis) against nitrogen oxides concentration, nox
# (x-axis).
plot(Boston$nox, Boston$tax)

The graph shows a positive correlation. This means that as the number of rooms in a house increases, the price of the house also tends to increase.

The graph shows a weak negative correlation, which means that towns with a higher pupil-teacher ratio tend to have lower median housing prices.

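The graph shows a positive correlation between the two variables: dis measures distance to employment centres, so tracts farther from those centres tend to have higher median housing prices. In other words, as the accessibility to jobs increases, the median housing price decreases.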

The graph shows a positive correlation, meaning that areas with higher air pollution levels tend to have higher property tax rates.

10.c)

# Per capita crime rate, crim (y-axis) against median home value, medv
# (x-axis).
plot(Boston$medv, Boston$crim)

The graph shows a negative correlation, meaning that towns with lower median home values tend to have higher crime rates. However, a few towns with the most expensive houses also have higher crime rates.

10.d)

# Minimum, quartiles, mean and maximum of the per capita crime rate.
summary(Boston[["crim"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00632  0.08204  0.25651  3.61352  3.67708 88.97620
# Same summary for the property-tax rate.
summary(Boston[["tax"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   187.0   279.0   330.0   408.2   666.0   711.0
# Same summary for the pupil-teacher ratio.
summary(Boston[["ptratio"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   12.60   17.40   19.05   18.46   20.20   22.00
# Histogram of per capita crime rate (50 bins, blue bars).
ggplot(data = Boston, mapping = aes(crim)) +
  geom_histogram(bins = 50, colour = "black", fill = "blue") +
  labs(title = "Distribution of Crime Rates") +
  theme_minimal()

# Histogram of property-tax rate (50 bins, red bars).
ggplot(data = Boston, mapping = aes(tax)) +
  geom_histogram(bins = 50, colour = "black", fill = "red") +
  labs(title = "Distribution of Tax Rates") +
  theme_minimal()

# Histogram of pupil-teacher ratio (50 bins, green bars).
ggplot(data = Boston, mapping = aes(ptratio)) +
  geom_histogram(bins = 50, colour = "black", fill = "green") +
  labs(title = "Distribution of Pupil-Teacher Ratios") +
  theme_minimal()

The histogram shows a right skewed distribution of crime rates. Most tracts have low crime rates but a few tracts show high crime rates.

The histogram indicates tracts tend to have tax rates clustered around two different tax rate levels.

The distribution is slightly left-skewed, with most tracts generally having high pupil-teacher ratios across Boston.

10.e)

# chas is the Charles River dummy variable, so its sum counts the tracts
# that bound the river.
number_of_tracts <- sum(Boston[["chas"]])
print(
  paste("Number of tracts that bound the Charles River:", number_of_tracts)
)
## [1] "Number of tracts that bound the Charles River: 35"

10.f)

# Median of the pupil-teacher ratio across all rows of Boston.
median_ptratio <- median(Boston[["ptratio"]])
print(
  paste("Median pupil-teacher ratio:", median_ptratio)
)
## [1] "Median pupil-teacher ratio: 19.05"

10.g)

# Keep every row whose medv equals the smallest medv in Boston (ties are all
# retained), then show those rows.
lowest_medv_tract <- Boston[which(Boston[["medv"]] == min(Boston[["medv"]])), ]
print(x = lowest_medv_tract)
##        crim zn indus chas   nox    rm age    dis rad tax ptratio lstat medv
## 399 38.3518  0  18.1    0 0.693 5.453 100 1.4896  24 666    20.2 30.59    5
## 406 67.9208  0  18.1    0 0.693 5.683 100 1.4254  24 666    20.2 22.98    5

The census tracts with the lowest median value of owner-occupied homes (medv) in the dataset are tracts 399 and 406.

# Per-column summary of the whole Boston data set, for comparison with the
# lowest-medv tracts.
overall_ranges <- summary(object = Boston)
print(x = overall_ranges)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          lstat      
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   : 1.73  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.: 6.95  
##  Median : 5.000   Median :330.0   Median :19.05   Median :11.36  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :12.65  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:16.95  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :37.97  
##       medv      
##  Min.   : 5.00  
##  1st Qu.:17.02  
##  Median :21.20  
##  Mean   :22.53  
##  3rd Qu.:25.00  
##  Max.   :50.00

10.h)

# Number of tracts whose average rooms per dwelling (rm) exceeds 7.
more_than_seven_rooms <- length(which(Boston[["rm"]] > 7))

# Number of tracts whose average rooms per dwelling (rm) exceeds 8.
more_than_eight_rooms <- length(which(Boston[["rm"]] > 8))

print(
  paste("Number of tracts with more than 7 rooms:", more_than_seven_rooms)
)
## [1] "Number of tracts with more than 7 rooms: 64"
print(
  paste("Number of tracts with more than 8 rooms:", more_than_eight_rooms)
)
## [1] "Number of tracts with more than 8 rooms: 13"
# Rows of Boston with rm above 8, summarised column by column.
tracts_more_than_eight_rooms <- Boston[which(Boston[["rm"]] > 8), ]
summary(object = tracts_more_than_eight_rooms)
##       crim               zn            indus             chas       
##  Min.   :0.02009   Min.   : 0.00   Min.   : 2.680   Min.   :0.0000  
##  1st Qu.:0.33147   1st Qu.: 0.00   1st Qu.: 3.970   1st Qu.:0.0000  
##  Median :0.52014   Median : 0.00   Median : 6.200   Median :0.0000  
##  Mean   :0.71879   Mean   :13.62   Mean   : 7.078   Mean   :0.1538  
##  3rd Qu.:0.57834   3rd Qu.:20.00   3rd Qu.: 6.200   3rd Qu.:0.0000  
##  Max.   :3.47428   Max.   :95.00   Max.   :19.580   Max.   :1.0000  
##       nox               rm             age             dis       
##  Min.   :0.4161   Min.   :8.034   Min.   : 8.40   Min.   :1.801  
##  1st Qu.:0.5040   1st Qu.:8.247   1st Qu.:70.40   1st Qu.:2.288  
##  Median :0.5070   Median :8.297   Median :78.30   Median :2.894  
##  Mean   :0.5392   Mean   :8.349   Mean   :71.54   Mean   :3.430  
##  3rd Qu.:0.6050   3rd Qu.:8.398   3rd Qu.:86.50   3rd Qu.:3.652  
##  Max.   :0.7180   Max.   :8.780   Max.   :93.90   Max.   :8.907  
##       rad              tax           ptratio          lstat           medv     
##  Min.   : 2.000   Min.   :224.0   Min.   :13.00   Min.   :2.47   Min.   :21.9  
##  1st Qu.: 5.000   1st Qu.:264.0   1st Qu.:14.70   1st Qu.:3.32   1st Qu.:41.7  
##  Median : 7.000   Median :307.0   Median :17.40   Median :4.14   Median :48.3  
##  Mean   : 7.462   Mean   :325.1   Mean   :16.36   Mean   :4.31   Mean   :44.2  
##  3rd Qu.: 8.000   3rd Qu.:307.0   3rd Qu.:17.40   3rd Qu.:5.12   3rd Qu.:50.0  
##  Max.   :24.000   Max.   :666.0   Max.   :20.20   Max.   :7.44   Max.   :50.0