#Question 8

# Read the College data. The location is machine-specific; edit it when
# running on another computer.
college <- read.csv(
  file = "C:/Users/apoor/OneDrive/Desktop/Master of Finance/Term 3/SuperForecasting/College.csv"
)

# The first column carries the university names: promote it to row names,
# then drop it from the data columns.
rownames(college) <- college[[1]]
college <- college[-1]

# Store Private as a factor and discard any row containing a missing value.
college[["Private"]] <- as.factor(college[["Private"]])
college <- na.omit(college)

# Per-column summary statistics.
summary(college)

##  Private        Apps           Accept          Enroll       Top10perc    
##  No :212   Min.   :   81   Min.   :   72   Min.   :  35   Min.   : 1.00  
##  Yes:565   1st Qu.:  776   1st Qu.:  604   1st Qu.: 242   1st Qu.:15.00  
##            Median : 1558   Median : 1110   Median : 434   Median :23.00  
##            Mean   : 3002   Mean   : 2019   Mean   : 780   Mean   :27.56  
##            3rd Qu.: 3624   3rd Qu.: 2424   3rd Qu.: 902   3rd Qu.:35.00  
##            Max.   :48094   Max.   :26330   Max.   :6392   Max.   :96.00  
##    Top25perc      F.Undergrad     P.Undergrad         Outstate    
##  Min.   :  9.0   Min.   :  139   Min.   :    1.0   Min.   : 2340  
##  1st Qu.: 41.0   1st Qu.:  992   1st Qu.:   95.0   1st Qu.: 7320  
##  Median : 54.0   Median : 1707   Median :  353.0   Median : 9990  
##  Mean   : 55.8   Mean   : 3700   Mean   :  855.3   Mean   :10441  
##  3rd Qu.: 69.0   3rd Qu.: 4005   3rd Qu.:  967.0   3rd Qu.:12925  
##  Max.   :100.0   Max.   :31643   Max.   :21836.0   Max.   :21700  
##    Room.Board       Books           Personal         PhD        
##  Min.   :1780   Min.   :  96.0   Min.   : 250   Min.   :  8.00  
##  1st Qu.:3597   1st Qu.: 470.0   1st Qu.: 850   1st Qu.: 62.00  
##  Median :4200   Median : 500.0   Median :1200   Median : 75.00  
##  Mean   :4358   Mean   : 549.4   Mean   :1341   Mean   : 72.66  
##  3rd Qu.:5050   3rd Qu.: 600.0   3rd Qu.:1700   3rd Qu.: 85.00  
##  Max.   :8124   Max.   :2340.0   Max.   :6800   Max.   :103.00  
##     Terminal       S.F.Ratio      perc.alumni        Expend     
##  Min.   : 24.0   Min.   : 2.50   Min.   : 0.00   Min.   : 3186  
##  1st Qu.: 71.0   1st Qu.:11.50   1st Qu.:13.00   1st Qu.: 6751  
##  Median : 82.0   Median :13.60   Median :21.00   Median : 8377  
##  Mean   : 79.7   Mean   :14.09   Mean   :22.74   Mean   : 9660  
##  3rd Qu.: 92.0   3rd Qu.:16.50   3rd Qu.:31.00   3rd Qu.:10830  
##  Max.   :100.0   Max.   :39.80   Max.   :64.00   Max.   :56233  
##    Grad.Rate     
##  Min.   : 10.00  
##  1st Qu.: 53.00  
##  Median : 65.00  
##  Mean   : 65.46  
##  3rd Qu.: 78.00  
##  Max.   :118.00

# Scatterplot matrix of the numeric variables among the first 10 columns.
# The is.numeric() mask must be applied to the same 10-column subset it was
# computed from: a 10-element logical index used on the full data frame is
# recycled across all columns and pulls in variables beyond the tenth.
first_ten <- college[, 1:10]
pairs(first_ten[sapply(first_ten, is.numeric)],
      main = "Scatterplot Matrix of First 10 Variables")

# Boxplots of Outstate, one box per level of the Private factor.
plot(
  x = college[["Private"]],
  y = college[["Outstate"]],
  xlab = "Private",
  ylab = "Outstate Tuition",
  main = "Outstate Tuition by Private/Public",
  col = "lightblue"
)

# Derive Elite: "Yes" where Top10perc exceeds 50, "No" otherwise, stored as a
# factor and appended to the data frame as a new column.
Elite <- as.factor(ifelse(college$Top10perc > 50, "Yes", "No"))
college <- data.frame(college, Elite)

# Number of colleges in each Elite level.
summary(Elite)

##  No Yes 
## 699  78

# Boxplots of Outstate, one box per level of the Elite factor.
plot(
  x = college[["Elite"]],
  y = college[["Outstate"]],
  xlab = "Elite",
  ylab = "Outstate Tuition",
  main = "Outstate Tuition by Elite Status",
  col = "lightgreen"
)

# Draw four histograms on a 2x2 grid. par() returns the settings it replaces,
# so they are saved here and restored afterwards; without the restore, every
# later single plot in the script would be drawn into a quarter-size panel.
old_par <- par(mfrow = c(2, 2))
hist(college$Apps, breaks = 15, col = "pink", main = "Applications")
hist(college$Accept, breaks = 15, col = "orange", main = "Acceptances")
hist(college$F.Undergrad, breaks = 15, col = "skyblue", main = "Full-Time Undergrads")
hist(college$Outstate, breaks = 15, col = "lightgreen", main = "Outstate Tuition")
par(old_par)

# -----------------------------------------------------------
# Summary of Potential Findings
# -----------------------------------------------------------
#
# 1. Private vs. Public Tuition: Private colleges have higher out-of-state
#    tuition than public colleges.
#
# 2. Elite Colleges: Colleges with more top-performing students (Elite = Yes)
#    tend to charge higher tuition.
#
# 3. Applications and Enrollment: There is significant variation in the
#    number of applications and full-time students across institutions.

#Question 9

# ---------------------------------------------
# Exercise 9: Auto Dataset Analysis
# ---------------------------------------------

# Read the Auto data (machine-specific location; edit as needed). The file has
# a header row, and "?" entries are read as NA.
auto <- read.table(
  file = "C:/Users/apoor/OneDrive/Desktop/Master of Finance/Term 3/SuperForecasting/Auto.data",
  header = TRUE,
  na.strings = "?"
)

# Drop every row that contains an NA.
auto <- na.omit(auto)

# Show each column's type along with its first few values.
str(auto)

## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : num  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : num  3504 3693 3436 3433 3449 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : chr  "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
##  - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
##   ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...

# ---------------------------------------------
# (a) Identify quantitative and qualitative predictors
# ---------------------------------------------

# Explanation: The `str()` function tells us the type of each column.
# From the structure:
# - Quantitative variables: mpg, cylinders, displacement, horsepower, weight, acceleration, year
# - Qualitative variables: origin, name

# ---------------------------------------------
# (b) Range of each quantitative predictor
# ---------------------------------------------
# Calculate the range of all quantitative predictors using the range() function.
# The quantitative columns are listed explicitly: `origin` is stored as an
# integer code but is qualitative (see part a), so selecting columns with
# is.numeric() would wrongly include it.
quant_vars <- c("mpg", "cylinders", "displacement", "horsepower",
                "weight", "acceleration", "year")
sapply(auto[, quant_vars], range)

##       mpg cylinders displacement horsepower weight acceleration year
## [1,]  9.0         3           68         46   1613          8.0   70
## [2,] 46.6         8          455        230   5140         24.8   82

# Explanation: The `range()` function returns the minimum and maximum value for each variable.

# ---------------------------------------------
# (c) Mean and standard deviation of quantitative predictors
# ---------------------------------------------
# Calculate the mean and standard deviation of all quantitative predictors
sapply(auto[, quant_vars], function(x) c(mean = mean(x), sd = sd(x)))

##            mpg cylinders displacement horsepower    weight acceleration
## mean 23.445918  5.471939      194.412  104.46939 2977.5842    15.541327
## sd    7.805007  1.705783      104.644   38.49116  849.4026     2.758864
##           year
## mean 75.979592
## sd    3.683737

# Explanation: This shows the average value and variability of each predictor.

# ---------------------------------------------
# (d) Remove 10th through 85th observations and recompute statistics
# ---------------------------------------------
# Create a subset of the data without the 10th through 85th observations
auto_subset <- auto[-(10:85), ]

# Compute the range for the subset (same quantitative columns as above)
range_subset <- sapply(auto_subset[, quant_vars], range)

# Compute the mean and standard deviation for the subset
mean_sd_subset <- sapply(auto_subset[, quant_vars], function(x) c(mean = mean(x), sd = sd(x)))

# Display results
range_subset

##       mpg cylinders displacement horsepower weight acceleration year
## [1,] 11.0         3           68         46   1649          8.5   70
## [2,] 46.6         8          455        230   4997         24.8   82

mean_sd_subset

##            mpg cylinders displacement horsepower    weight acceleration
## mean 24.404430  5.373418    187.24051  100.72152 2935.9715    15.726899
## sd    7.867283  1.654179     99.67837   35.70885  811.3002     2.693721
##           year
## mean 77.145570
## sd    3.106217

# ---------------------------------------------
# (e) Investigate the predictors graphically
# ---------------------------------------------

# Explanation: Scatterplot matrix of mpg against four of the other variables
key_vars <- c("mpg", "displacement", "horsepower", "weight", "acceleration")
pairs(auto[key_vars], main = "Scatterplot Matrix")

# Explanation: Individual scatterplots of mpg against single variables
# MPG vs Horsepower
plot(
  x = auto[["horsepower"]],
  y = auto[["mpg"]],
  xlab = "Horsepower", ylab = "MPG",
  main = "MPG vs. Horsepower",
  col = "blue", pch = 21
)

# MPG vs Weight
plot(
  x = auto[["weight"]],
  y = auto[["mpg"]],
  xlab = "Weight", ylab = "MPG",
  main = "MPG vs. Weight",
  col = "green", pch = 21
)

# ---------------------------------------------
# (f) Predicting mpg based on other variables
# ---------------------------------------------

# Explanation: From the scatterplots, we observe:
# - A strong negative relationship between `mpg` and `horsepower`
# - A negative correlation between `mpg` and `weight`
# - Displacement and acceleration also exhibit some correlation

# Conclusion: Variables like `horsepower`, `weight`, and `displacement` are likely good predictors for mpg.

# Summary of Results
# - Quantitative predictors: mpg, cylinders, displacement, horsepower, weight, acceleration, year
# - Qualitative predictors: origin, name
# - Range, mean, and standard deviation: calculated for both the full data and the subset.
# - Scatterplots: show clear relationships, particularly the negative correlation of mpg with horsepower and weight.
# ---------------------------------------------

#Question 10

# ---------------------------------------------
# Exercise 10: Boston Housing Dataset Analysis
# ---------------------------------------------

# (a) Load the Boston dataset
# Read the CSV from its machine-specific location (edit the path as needed).
boston <- read.csv(
  file = "C:/Users/apoor/OneDrive/Desktop/Master of Finance/Term 3/SuperForecasting/Boston.csv"
)

# Show each column's type along with its first few values.
str(boston)

## 'data.frame':    506 obs. of  14 variables:
##  $ X      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : int  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...

# dim() returns c(rows, columns); report each on its own line.
boston_dim <- dim(boston)
cat("Number of rows:", boston_dim[1], "\n")

## Number of rows: 506

cat("Number of columns:", boston_dim[2], "\n")

## Number of columns: 14

# Explanation:
# - Rows represent 506 census tracts (neighborhoods) in Boston.
# - Of the 14 columns, `X` is only a row index (1 to 506); the other 13 are the variables,
#   including crime rate (`crim`), tax rate (`tax`), and median home value (`medv`).

# ---------------------------------------------
# (b) Pairwise scatterplots of predictors
# ---------------------------------------------
# `X` only numbers the rows (1 to 506) and is not a predictor, so it is
# dropped before taking the first 10 columns for the scatterplot matrix.
boston_predictors <- boston[setdiff(names(boston), "X")]
pairs(boston_predictors[, 1:10], main = "Scatterplot Matrix of First 10 Predictors")

# Explanation:
# - The scatterplot matrix helps visualize relationships between predictors.
# - For example, `rm` (average number of rooms) might show a positive relationship with `medv` (median home value),
#   while `crim` (crime rate) might be negatively correlated with `dis` (distance to employment centers).

# ---------------------------------------------
# (c) Correlation with per capita crime rate (crim)
# ---------------------------------------------
# Correlation matrix of the variables. The row-index column `X` is excluded:
# correlating crime rate with a row number only reflects how the file happens
# to be ordered.
correlations <- cor(boston[setdiff(names(boston), "X")])
correlations["crim", ]

##        crim          zn       indus        chas         nox          rm 
##  1.00000000 -0.20046922  0.40658341 -0.05589158  0.42097171 -0.21924670 
##         age         dis         rad         tax     ptratio       lstat 
##  0.35273425 -0.37967009  0.62550515  0.58276431  0.28994558  0.45562148 
##        medv 
## -0.38830461

# Explanation:
# - This shows how crime rate (`crim`) correlates with the other variables.
# - The strongest positive correlations are with `rad` (0.63) and `tax` (0.58): tracts with higher
#   values of these variables tend to have higher crime rates.
# - `dis` (-0.38) and `medv` (-0.39) show moderate negative correlations: tracts farther from
#   employment centers, and tracts with higher median home values, tend to have less crime.
# - The correlation with `ptratio` (0.29) is positive but weak.

# ---------------------------------------------
# (d) Summary and range of crime rates, tax rates, and pupil-teacher ratios
# ---------------------------------------------
# For each of crim, tax and ptratio: print a heading, then the summary()
# of that column (minimum, quartiles, mean, maximum).
cat("\nSummary of Crime Rates:\n")

## 
## Summary of Crime Rates:

summary(boston[["crim"]])

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00632  0.08204  0.25651  3.61352  3.67708 88.97620

cat("\nSummary of Tax Rates:\n")

## 
## Summary of Tax Rates:

summary(boston[["tax"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   187.0   279.0   330.0   408.2   666.0   711.0

cat("\nSummary of Pupil-Teacher Ratios:\n")

## 
## Summary of Pupil-Teacher Ratios:

summary(boston[["ptratio"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   12.60   17.40   19.05   18.46   20.20   22.00

# Explanation:
# - This provides minimum, maximum, and median values for crime rates, tax rates, and pupil-teacher ratios.
# - Outliers can be detected, such as areas with high crime or tax rates.

# ---------------------------------------------
# (e) Census tracts that bound the Charles River
# ---------------------------------------------
# Count the rows where chas equals 1 (summing a logical vector counts TRUEs).
bounds_river <- boston[["chas"]] == 1
charles_river_bound <- sum(bounds_river)
cat("\nNumber of tracts bounding the Charles River:", charles_river_bound, "\n")

## 
## Number of tracts bounding the Charles River: 35

# Explanation:
# - The `chas` variable indicates whether a census tract bounds the Charles River (1 = yes).
# - This output tells how many neighborhoods are along the river.

# ---------------------------------------------
# (f) Median pupil-teacher ratio
# ---------------------------------------------
# Median of the ptratio column over all rows.
median_ptratio <- median(boston[["ptratio"]])
cat("\nMedian pupil-teacher ratio:", median_ptratio, "\n")

## 
## Median pupil-teacher ratio: 19.05

# Explanation:
# - This calculates the median pupil-teacher ratio across all census tracts.

# ---------------------------------------------
# (g) Census tract with the lowest median value of owner-occupied homes
# ---------------------------------------------
# Select every tract whose medv equals the minimum. which.min() reports only
# the first position of the minimum, so tracts tied at the lowest value would
# be silently left out. na.rm = TRUE keeps which.min()'s behavior of ignoring
# missing values.
min_medv_index <- which(boston$medv == min(boston$medv, na.rm = TRUE))
lowest_tract <- boston[min_medv_index, ]

cat("\nCensus tract with lowest median home value:\n")

## 
## Census tract with lowest median home value:

print(lowest_tract)

# NOTE(review): the output below was captured when only the first minimum was
# selected; re-run to see whether other tracts tie at the same medv.
##       X    crim zn indus chas   nox    rm age    dis rad tax ptratio lstat medv
## 399 399 38.3518  0  18.1    0 0.693 5.453 100 1.4896  24 666    20.2 30.59    5

# Compare values of predictors for this tract with overall ranges
cat("\nOverall range of predictors:\n")

## 
## Overall range of predictors:

sapply(boston, range)

##        X     crim  zn indus chas   nox    rm   age     dis rad tax ptratio
## [1,]   1  0.00632   0  0.46    0 0.385 3.561   2.9  1.1296   1 187    12.6
## [2,] 506 88.97620 100 27.74    1 0.871 8.780 100.0 12.1265  24 711    22.0
##      lstat medv
## [1,]  1.73    5
## [2,] 37.97   50

# Explanation:
# - This identifies the neighborhood with the lowest home values and compares its predictors (like crime rate and tax)
#   with the overall dataset.

# ---------------------------------------------
# (h) Census tracts with more than 7 or 8 rooms per dwelling
# ---------------------------------------------
# Count the tracts whose rm value exceeds 7 and 8 respectively.
rooms <- boston[["rm"]]
tracts_more_7_rooms <- sum(rooms > 7)
tracts_more_8_rooms <- sum(rooms > 8)

cat("\nNumber of census tracts with more than 7 rooms per dwelling:", tracts_more_7_rooms, "\n")

## 
## Number of census tracts with more than 7 rooms per dwelling: 64

cat("Number of census tracts with more than 8 rooms per dwelling:", tracts_more_8_rooms, "\n")

## Number of census tracts with more than 8 rooms per dwelling: 13

# Print the full rows for the tracts with rm above 8
cat("\nCensus tracts with more than 8 rooms:\n")

## 
## Census tracts with more than 8 rooms:

print(boston[rooms > 8, ])

##       X    crim zn indus chas    nox    rm  age    dis rad tax ptratio lstat
## 98   98 0.12083  0  2.89    0 0.4450 8.069 76.0 3.4952   2 276    18.0  4.21
## 164 164 1.51902  0 19.58    1 0.6050 8.375 93.9 2.1620   5 403    14.7  3.32
## 205 205 0.02009 95  2.68    0 0.4161 8.034 31.9 5.1180   4 224    14.7  2.88
## 225 225 0.31533  0  6.20    0 0.5040 8.266 78.3 2.8944   8 307    17.4  4.14
## 226 226 0.52693  0  6.20    0 0.5040 8.725 83.0 2.8944   8 307    17.4  4.63
## 227 227 0.38214  0  6.20    0 0.5040 8.040 86.5 3.2157   8 307    17.4  3.13
## 233 233 0.57529  0  6.20    0 0.5070 8.337 73.3 3.8384   8 307    17.4  2.47
## 234 234 0.33147  0  6.20    0 0.5070 8.247 70.4 3.6519   8 307    17.4  3.95
## 254 254 0.36894 22  5.86    0 0.4310 8.259  8.4 8.9067   7 330    19.1  3.54
## 258 258 0.61154 20  3.97    0 0.6470 8.704 86.9 1.8010   5 264    13.0  5.12
## 263 263 0.52014 20  3.97    0 0.6470 8.398 91.5 2.2885   5 264    13.0  5.91
## 268 268 0.57834 20  3.97    0 0.5750 8.297 67.0 2.4216   5 264    13.0  7.44
## 365 365 3.47428  0 18.10    1 0.7180 8.780 82.9 1.9047  24 666    20.2  5.29
##     medv
## 98  38.7
## 164 50.0
## 205 50.0
## 225 44.8
## 226 50.0
## 227 37.6
## 233 41.7
## 234 48.3
## 254 42.8
## 258 50.0
## 263 48.8
## 268 50.0
## 365 21.9

# Explanation:
# - Neighborhoods with more than 8 rooms per dwelling are typically wealthier and have higher median home values (`medv`).
# - This output identifies which neighborhoods meet this condition and provides further details on them.

# ---------------------------------------------
# Acknowledgements and Citations
# ---------------------------------------------
#
# I used ChatGPT for guidance on certain parts of this code, including:
# - Understanding the correlation analysis
# - Generating scatterplot matrices
# - Structuring and organizing code for summaries
#
# For detailed understanding, I referred to Chapter 2.3 of the book:
# "An Introduction to Statistical Learning" (ISLR2, corrected June 2023 version).
#
# Assignment 1 by Apoorva Shanker
# 2025-01-30

# -----------------------------------------------------------
# Summary of Potential Findings
# -----------------------------------------------------------
#
# 1. Private vs. Public Tuition: Private colleges have higher out-of-state
#    tuition than public colleges.
#
# 2. Elite Colleges: Colleges with more top-performing students (Elite = Yes)
#    tend to charge higher tuition.
#
# 3. Applications and Enrollment: There is significant variation in the
#    number of applications and full-time students across institutions.

# ---------------------------------------------
# Acknowledgements and Citations
# ---------------------------------------------
#
# I used ChatGPT for guidance on certain parts of this code, including:
# - Understanding the correlation analysis
# - Generating scatterplot matrices
# - Structuring and organizing code for summaries
#
# For detailed understanding, I referred to Chapter 2.3 of the book:
# "An Introduction to Statistical Learning" (ISLR2, corrected June 2023 version).