#Question 8
# Read the College data from disk (adjust the path for your machine)
college <- read.csv(
  file = "C:/Users/apoor/OneDrive/Desktop/Master of Finance/Term 3/SuperForecasting/College.csv"
)
# The first column holds the university names: promote it to row names,
# then drop it so only the measured variables remain as columns
rownames(college) <- college[[1]]
college <- college[, -1]
# Store Private as a factor and discard any rows containing missing values
college[["Private"]] <- as.factor(college[["Private"]])
college <- na.omit(college)
# Print per-column summary statistics
summary(college)
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate
## Min. : 10.00
## 1st Qu.: 53.00
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
# Scatterplot matrix of the numeric variables among the first 10 columns.
# The data frame is cut down to its first 10 columns *before* the numeric mask
# is applied: a length-10 logical index used on the full data frame would be
# recycled across all columns and pull in variables beyond the first 10.
first_ten <- college[, 1:10]
numeric_cols <- vapply(first_ten, is.numeric, logical(1))
pairs(first_ten[, numeric_cols, drop = FALSE],
      main = "Scatterplot Matrix of First 10 Variables")
# Side-by-side boxplots of Outstate tuition, one box per level of Private
with(
  college,
  plot(
    Private, Outstate,
    main = "Outstate Tuition by Private/Public",
    xlab = "Private",
    ylab = "Outstate Tuition",
    col = "lightblue"
  )
)
# Build the Elite flag: start with "No" for every college, switch to "Yes"
# wherever Top10perc exceeds 50, then store it as a factor
Elite <- replace(
  rep_len("No", nrow(college)),
  college$Top10perc > 50,
  "Yes"
)
Elite <- factor(Elite)
# Append the flag to the data frame as a new column and count each level
college <- data.frame(college, Elite = Elite)
summary(Elite)
## No Yes
## 699 78
# Side-by-side boxplots of Outstate tuition, one box per level of Elite
with(
  college,
  plot(
    Elite, Outstate,
    main = "Outstate Tuition by Elite Status",
    xlab = "Elite",
    ylab = "Outstate Tuition",
    col = "lightgreen"
  )
)
# Draw four histograms in a 2x2 grid.
# par() returns the previous settings, so they are saved here and restored
# once the histograms are drawn; otherwise the 2x2 layout would persist and
# every later single plot in this script would be squeezed into a quarter panel.
old_par <- par(mfrow = c(2, 2))
hist(college$Apps, breaks = 15, col = "pink", main = "Applications")
hist(college$Accept, breaks = 15, col = "orange", main = "Acceptances")
hist(college$F.Undergrad, breaks = 15, col = "skyblue", main = "Full-Time Undergrads")
hist(college$Outstate, breaks = 15, col = "lightgreen", main = "Outstate Tuition")
par(old_par)
#Question 9
# ---------------------------------------------
# Exercise 9: Auto Dataset Analysis
# ---------------------------------------------
# Read the Auto data (adjust the path for your machine).
# The file has a header row and encodes missing values as "?".
auto <- read.table(
  file = "C:/Users/apoor/OneDrive/Desktop/Master of Finance/Term 3/SuperForecasting/Auto.data",
  header = TRUE,
  na.strings = "?"
)
# Keep only the complete rows
auto <- na.omit(auto)
# Inspect the column types and the first few values of each column
str(auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : chr "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
# ---------------------------------------------
# (a) Identify quantitative and qualitative predictors
# ---------------------------------------------
# Explanation: The `str()` function tells us the type of each column.
# From the structure:
# - Quantitative variables: mpg, cylinders, displacement, horsepower, weight, acceleration, year
# - Qualitative variables: origin, name
# ---------------------------------------------
# (b) Range of each quantitative predictor
# ---------------------------------------------
# Quantitative columns, listed explicitly. `origin` is stored as an integer
# code but is qualitative (see part (a)), so selecting columns with
# is.numeric() would wrongly include it in the statistics below.
quant_vars <- c("mpg", "cylinders", "displacement", "horsepower",
                "weight", "acceleration", "year")
# Calculate the range of all quantitative predictors using the range() function
# (row 1 = minimum, row 2 = maximum)
vapply(auto[quant_vars], range, numeric(2))
## mpg cylinders displacement horsepower weight acceleration year
## [1,] 9.0 3 68 46 1613 8.0 70
## [2,] 46.6 8 455 230 5140 24.8 82
# Explanation: The `range()` function returns the minimum and maximum value for each variable.
# ---------------------------------------------
# (c) Mean and standard deviation of quantitative predictors
# ---------------------------------------------
# Calculate the mean and standard deviation of all quantitative predictors
vapply(auto[quant_vars], function(x) c(mean = mean(x), sd = sd(x)), numeric(2))
## mpg cylinders displacement horsepower weight acceleration
## mean 23.445918 5.471939 194.412 104.46939 2977.5842 15.541327
## sd 7.805007 1.705783 104.644 38.49116 849.4026 2.758864
## year
## mean 75.979592
## sd 3.683737
# Explanation: This shows the average value and variability of each predictor.
# ---------------------------------------------
# (d) Remove 10th through 85th observations and recompute statistics
# ---------------------------------------------
# Create a subset of the data without the 10th through 85th observations
# (positions are counted after the incomplete rows were dropped)
auto_subset <- auto[-(10:85), ]
# Compute the range for the subset
range_subset <- vapply(auto_subset[quant_vars], range, numeric(2))
# Compute the mean and standard deviation for the subset
mean_sd_subset <- vapply(
  auto_subset[quant_vars],
  function(x) c(mean = mean(x), sd = sd(x)),
  numeric(2)
)
# Display results
range_subset
## mpg cylinders displacement horsepower weight acceleration year
## [1,] 11.0 3 68 46 1649 8.5 70
## [2,] 46.6 8 455 230 4997 24.8 82
mean_sd_subset
## mpg cylinders displacement horsepower weight acceleration
## mean 24.404430 5.373418 187.24051 100.72152 2935.9715 15.726899
## sd 7.867283 1.654179 99.67837 35.70885 811.3002 2.693721
## year
## mean 77.145570
## sd 3.106217
# ---------------------------------------------
# (e) Investigate the predictors graphically
# ---------------------------------------------
# Scatterplot matrix of mpg against the four continuous engine/size measures
pairs(
  ~ mpg + displacement + horsepower + weight + acceleration,
  data = auto,
  main = "Scatterplot Matrix"
)
# Individual scatterplots for two relationships of interest
# MPG as a function of horsepower
plot(
  mpg ~ horsepower,
  data = auto,
  main = "MPG vs. Horsepower",
  xlab = "Horsepower", ylab = "MPG", col = "blue", pch = 21
)
# MPG as a function of weight
plot(
  mpg ~ weight,
  data = auto,
  main = "MPG vs. Weight",
  xlab = "Weight", ylab = "MPG", col = "green", pch = 21
)
# ---------------------------------------------
# (f) Predicting mpg based on other variables
# ---------------------------------------------
# Explanation: From the scatterplots, we observe:
# - A strong negative relationship between `mpg` and `horsepower`
# - A negative correlation between `mpg` and `weight`
# - Displacement and acceleration also exhibit some correlation
# Conclusion: Variables like `horsepower`, `weight`, and `displacement` are likely good predictors for mpg.
# Summary of Results
# - Quantitative predictors: mpg, cylinders, displacement, horsepower, weight, acceleration, year
# - Qualitative predictors: origin, name
# - Range, mean, and standard deviation: calculated for both the full data and the subset.
# - Scatterplots: show clear relationships, particularly the negative correlation of mpg with horsepower and weight.
# ---------------------------------------------
#Question 10
# ---------------------------------------------
# Exercise 10: Boston Housing Dataset Analysis
# ---------------------------------------------
# (a) Read the Boston data
# Read the CSV from disk (adjust the path for your machine)
boston <- read.csv(
  file = "C:/Users/apoor/OneDrive/Desktop/Master of Finance/Term 3/SuperForecasting/Boston.csv"
)
# Show each column's type and first values; the header line of the output
# also reports the number of observations and variables
str(boston)
## 'data.frame': 506 obs. of 14 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : int 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Report the number of observations (rows) in the data frame
cat("Number of rows:", NROW(boston), "\n")
## Number of rows: 506
# Report the number of columns in the data frame
cat("Number of columns:", NCOL(boston), "\n")
## Number of columns: 14
# Explanation:
# - Rows represent 506 census tracts (neighborhoods) in Boston.
# - The 14 columns are a row index (`X`) plus 13 variables, including crime rate, housing values, and more.
# ---------------------------------------------
# (b) Pairwise scatterplots of predictors
# ---------------------------------------------
# `X` is only the row index stored in the CSV, not a predictor, so it is
# dropped before taking the first 10 columns for the scatterplot matrix.
boston_predictors <- boston[, names(boston) != "X", drop = FALSE]
pairs(boston_predictors[, 1:10], main = "Scatterplot Matrix of First 10 Predictors")
# Explanation:
# - The scatterplot matrix helps visualize relationships between predictors.
# - For example, `crim` (crime rate) might be negatively related to `dis` (distance to employment centers).
# - `medv` (median home value) is not among these ten columns, so its relationship with `rm`
# (average number of rooms) needs a separate plot.
# ---------------------------------------------
# (c) Correlation with per capita crime rate (crim)
# ---------------------------------------------
# Correlation matrix of the real variables only. `X` is the row index stored
# in the CSV; correlating it with anything is meaningless, so it is excluded.
correlations <- cor(boston[, names(boston) != "X"])
# Correlation of the per capita crime rate with every variable
correlations["crim", ]
## crim zn indus chas nox rm
## 1.00000000 -0.20046922 0.40658341 -0.05589158 0.42097171 -0.21924670
## age dis rad tax ptratio lstat
## 0.35273425 -0.37967009 0.62550515 0.58276431 0.28994558 0.45562148
## medv
## -0.38830461
# Explanation:
# - This shows how crime rate (`crim`) correlates with the other variables.
# - The strongest correlations are positive: `rad` (0.63) and `tax` (0.58), so tracts with higher tax rates tend to have higher crime.
# - The negative correlation with `dis` (-0.38) is moderate; it suggests that neighborhoods farther from employment centers tend to have less crime.
# - The positive correlation with `ptratio` (0.29) is weaker: areas with higher pupil-teacher ratios
# may have somewhat higher crime.
# ---------------------------------------------
# (d) Summary and range of crime rates, tax rates, and pupil-teacher ratios
# ---------------------------------------------
# Labelled summary (min, quartiles, mean, max) of the per capita crime rate
cat("\nSummary of Crime Rates:\n")
##
## Summary of Crime Rates:
print(summary(boston[["crim"]]))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00632 0.08204 0.25651 3.61352 3.67708 88.97620
# Same summary for the tax rate
cat("\nSummary of Tax Rates:\n")
##
## Summary of Tax Rates:
print(summary(boston[["tax"]]))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 187.0 279.0 330.0 408.2 666.0 711.0
# Same summary for the pupil-teacher ratio
cat("\nSummary of Pupil-Teacher Ratios:\n")
##
## Summary of Pupil-Teacher Ratios:
print(summary(boston[["ptratio"]]))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.60 17.40 19.05 18.46 20.20 22.00
# Explanation:
# - This provides minimum, maximum, and median values for crime rates, tax rates, and pupil-teacher ratios.
# - Outliers can be detected, such as areas with high crime or tax rates.
# ---------------------------------------------
# (e) Census tracts that bound the Charles River
# ---------------------------------------------
# Count the tracts whose `chas` indicator equals 1
charles_river_bound <- with(boston, sum(chas == 1))
cat(
  "\nNumber of tracts bounding the Charles River:",
  charles_river_bound,
  "\n"
)
##
## Number of tracts bounding the Charles River: 35
# Explanation:
# - The `chas` variable indicates whether a census tract bounds the Charles River (1 = yes).
# - This output tells how many neighborhoods are along the river.
# ---------------------------------------------
# (f) Median pupil-teacher ratio
# ---------------------------------------------
# Median of the pupil-teacher ratio column over all rows
median_ptratio <- with(boston, median(ptratio))
cat(
  "\nMedian pupil-teacher ratio:",
  median_ptratio,
  "\n"
)
##
## Median pupil-teacher ratio: 19.05
# Explanation:
# - This calculates the median pupil-teacher ratio across all census tracts.
# ---------------------------------------------
# (g) Census tract with the lowest median value of owner-occupied homes
# ---------------------------------------------
# Find every tract that attains the minimum median home value.
# which.min() reports only the first minimum, so tracts tied for the lowest
# `medv` would be silently dropped; comparing against the minimum keeps them
# all. The comparison is exact because the minimum is taken from the same
# vector. Missing values are ignored, as which.min() did.
min_medv_index <- which(boston$medv == min(boston$medv, na.rm = TRUE))
lowest_tract <- boston[min_medv_index, ]
cat("\nCensus tract with lowest median home value:\n")
##
## Census tract with lowest median home value:
print(lowest_tract)
## X crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 399 399 38.3518 0 18.1 0 0.693 5.453 100 1.4896 24 666 20.2 30.59 5
# NOTE(review): the output above was captured when only the first minimum was
# selected; re-run to refresh it, since any tract tied at the minimum now
# prints as an additional row.
# Compare values of predictors for this tract with overall ranges
cat("\nOverall range of predictors:\n")
##
## Overall range of predictors:
# Row 1 = minimum, row 2 = maximum of each column
vapply(boston, range, numeric(2))
## X crim zn indus chas nox rm age dis rad tax ptratio
## [1,] 1 0.00632 0 0.46 0 0.385 3.561 2.9 1.1296 1 187 12.6
## [2,] 506 88.97620 100 27.74 1 0.871 8.780 100.0 12.1265 24 711 22.0
## lstat medv
## [1,] 1.73 5
## [2,] 37.97 50
# Explanation:
# - This identifies the neighborhood with the lowest home values and compares its predictors (like crime rate and tax)
# with the overall dataset.
# ---------------------------------------------
# (h) Census tracts with more than 7 or 8 rooms per dwelling
# ---------------------------------------------
# Count the tracts whose `rm` value exceeds each threshold
tracts_more_7_rooms <- sum(boston[["rm"]] > 7)
tracts_more_8_rooms <- sum(boston[["rm"]] > 8)
cat(
  "\nNumber of census tracts with more than 7 rooms per dwelling:",
  tracts_more_7_rooms,
  "\n"
)
##
## Number of census tracts with more than 7 rooms per dwelling: 64
cat(
  "Number of census tracts with more than 8 rooms per dwelling:",
  tracts_more_8_rooms,
  "\n"
)
## Number of census tracts with more than 8 rooms per dwelling: 13
# List every column for the tracts above the 8-room threshold
cat("\nCensus tracts with more than 8 rooms:\n")
##
## Census tracts with more than 8 rooms:
print(boston[boston[["rm"]] > 8, ])
## X crim zn indus chas nox rm age dis rad tax ptratio lstat
## 98 98 0.12083 0 2.89 0 0.4450 8.069 76.0 3.4952 2 276 18.0 4.21
## 164 164 1.51902 0 19.58 1 0.6050 8.375 93.9 2.1620 5 403 14.7 3.32
## 205 205 0.02009 95 2.68 0 0.4161 8.034 31.9 5.1180 4 224 14.7 2.88
## 225 225 0.31533 0 6.20 0 0.5040 8.266 78.3 2.8944 8 307 17.4 4.14
## 226 226 0.52693 0 6.20 0 0.5040 8.725 83.0 2.8944 8 307 17.4 4.63
## 227 227 0.38214 0 6.20 0 0.5040 8.040 86.5 3.2157 8 307 17.4 3.13
## 233 233 0.57529 0 6.20 0 0.5070 8.337 73.3 3.8384 8 307 17.4 2.47
## 234 234 0.33147 0 6.20 0 0.5070 8.247 70.4 3.6519 8 307 17.4 3.95
## 254 254 0.36894 22 5.86 0 0.4310 8.259 8.4 8.9067 7 330 19.1 3.54
## 258 258 0.61154 20 3.97 0 0.6470 8.704 86.9 1.8010 5 264 13.0 5.12
## 263 263 0.52014 20 3.97 0 0.6470 8.398 91.5 2.2885 5 264 13.0 5.91
## 268 268 0.57834 20 3.97 0 0.5750 8.297 67.0 2.4216 5 264 13.0 7.44
## 365 365 3.47428 0 18.10 1 0.7180 8.780 82.9 1.9047 24 666 20.2 5.29
## medv
## 98 38.7
## 164 50.0
## 205 50.0
## 225 44.8
## 226 50.0
## 227 37.6
## 233 41.7
## 234 48.3
## 254 42.8
## 258 50.0
## 263 48.8
## 268 50.0
## 365 21.9
# Explanation:
# - Neighborhoods with more than 8 rooms per dwelling are typically wealthier and have higher median home values (`medv`).
# - This output identifies which neighborhoods meet this condition and provides further details on them.