alumni <- read.csv("C:/Users/Gokul/Downloads/alumni (1).csv")
head(alumni,5)
##                               school percent_of_classes_under_20
## 1                     Boston College                          39
## 2               Brandeis University                           68
## 3                   Brown University                          60
## 4 California Institute of Technology                          65
## 5         Carnegie Mellon University                          67
##   student_faculty_ratio alumni_giving_rate private
## 1                    13                 25       1
## 2                     8                 33       1
## 3                     8                 40       1
## 4                     3                 46       1
## 5                    10                 28       1
str(alumni)
## 'data.frame':    48 obs. of  5 variables:
##  $ school                     : chr  "Boston College" "Brandeis University " "Brown University" "California Institute of Technology" ...
##  $ percent_of_classes_under_20: int  39 68 60 65 67 52 45 69 72 61 ...
##  $ student_faculty_ratio      : int  13 8 8 3 10 8 12 7 13 10 ...
##  $ alumni_giving_rate         : int  25 33 40 46 28 31 27 31 35 53 ...
##  $ private                    : int  1 1 1 1 1 1 1 1 1 1 ...
summary(alumni)
##     school          percent_of_classes_under_20 student_faculty_ratio
##  Length:48          Min.   :29.00               Min.   : 3.00        
##  Class :character   1st Qu.:44.75               1st Qu.: 8.00        
##  Mode  :character   Median :59.50               Median :10.50        
##                     Mean   :55.73               Mean   :11.54        
##                     3rd Qu.:66.25               3rd Qu.:13.50        
##                     Max.   :77.00               Max.   :23.00        
##  alumni_giving_rate    private      
##  Min.   : 7.00      Min.   :0.0000  
##  1st Qu.:18.75      1st Qu.:0.0000  
##  Median :29.00      Median :1.0000  
##  Mean   :29.27      Mean   :0.6875  
##  3rd Qu.:38.50      3rd Qu.:1.0000  
##  Max.   :67.00      Max.   :1.0000
Y  <- alumni$alumni_giving_rate
X1 <- alumni$percent_of_classes_under_20
X2 <- alumni$student_faculty_ratio
X3 <- factor(alumni$private, levels = c(0,1),
             labels = c("Public","Private"))

# model without the categorical variable
m0 <- lm(Y ~ X1 + X2, data = alumni)

# multiple linear regression model including the categorical variable X3
m1 <- lm(Y ~ X1 + X2 + X3, data = alumni)
summary(m1)
## 
## Call:
## lm(formula = Y ~ X1 + X2 + X3, data = alumni)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.757  -6.320  -2.273   5.152  25.669 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 36.78364   13.67220   2.690  0.01005 * 
## X1           0.07725    0.17873   0.432  0.66768   
## X2          -1.39835    0.51075  -2.738  0.00889 **
## X3Private    6.28534    5.35633   1.173  0.24693   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.06 on 44 degrees of freedom
## Multiple R-squared:  0.5747, Adjusted R-squared:  0.5457 
## F-statistic: 19.81 on 3 and 44 DF,  p-value: 2.818e-08
# model without the categorical variable
summary(m0)
## 
## Call:
## lm(formula = Y ~ X1 + X2, data = alumni)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15.00  -6.57  -1.95   4.42  24.56 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  39.6556    13.5076   2.936 0.005225 ** 
## X1            0.1662     0.1626   1.022 0.312128    
## X2           -1.7021     0.4421  -3.850 0.000371 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.098 on 45 degrees of freedom
## Multiple R-squared:  0.5613, Adjusted R-squared:  0.5418 
## F-statistic: 28.79 on 2 and 45 DF,  p-value: 8.869e-09

a. Plot the alumni giving rate against either X1 or X2 in a scatter plot, with private vs. public school (X3) shown in different colours.

library(ggplot2)

ggplot(alumni, aes(x = X1, y = Y, color = X3)) +
  geom_point(size = 3) +
  labs(title = "Giving Rate vs X1 by Private/Public School",
       x = "X1: % Alumni Annual Fund", 
       y = "Giving Rate")

b. Does this new model differ from the one without the categorical variable?

anova(m0, m1)
## Analysis of Variance Table
## 
## Model 1: Y ~ X1 + X2
## Model 2: Y ~ X1 + X2 + X3
##   Res.Df    RSS Df Sum of Sq     F Pr(>F)
## 1     45 3724.9                          
## 2     44 3611.8  1    113.03 1.377 0.2469

Explanation

The adjusted R-squared is almost the same for both models (0.546 with X3 vs. 0.542 without).

The ANOVA also shows that adding the categorical variable X3 (private vs. public school) does not significantly improve the model. The residual sum of squares is smaller when X3 is added, but the F-test for the additional term is not significant with F = 1.377 and p = 0.2469.

This suggests that, once X1 and X2 are in the model, type of school does not account for a significant additional proportion of variation in the response, alumni giving rate. Thus, the new model is not significantly different from the simpler one without the categorical variable.
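
As a further check on this conclusion (not part of the original output), the two models can also be compared on AIC and adjusted R-squared; a lower AIC and a higher adjusted R-squared indicate the better-supported model.

# Compare information criteria and adjusted R-squared for the two models
AIC(m0, m1)
c(m0 = summary(m0)$adj.r.squared, m1 = summary(m1)$adj.r.squared)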

c. Add transformations of Y to your model. Do you see any improvement in model performance or specification? Explain your reasoning.

alumni$Y_1 <- sqrt(alumni$alumni_giving_rate)
m3_sqrt <- lm(Y_1 ~ X1 + X2 + X3, data = alumni)
summary(m3_sqrt)
## 
## Call:
## lm(formula = Y_1 ~ X1 + X2 + X3, data = alumni)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7273 -0.5441 -0.1960  0.5932  1.7554 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.068862   1.239438   4.896 1.36e-05 ***
## X1           0.003665   0.016203   0.226  0.82210    
## X2          -0.134842   0.046301  -2.912  0.00562 ** 
## X3Private    0.786074   0.485573   1.619  0.11262    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8213 on 44 degrees of freedom
## Multiple R-squared:  0.6212, Adjusted R-squared:  0.5953 
## F-statistic: 24.05 on 3 and 44 DF,  p-value: 2.288e-09
alumni$Y_2 <- log(alumni$alumni_giving_rate)
m4_log <- lm(Y_2 ~ X1 + X2 + X3, data = alumni)
summary(m4_log)
## 
## Call:
## lm(formula = Y_2 ~ X1 + X2 + X3, data = alumni)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.72938 -0.19240 -0.06253  0.24457  0.59591 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.627588   0.500437   7.249 4.94e-09 ***
## X1          -0.000158   0.006542  -0.024   0.9808    
## X2          -0.054849   0.018695  -2.934   0.0053 ** 
## X3Private    0.389736   0.196056   1.988   0.0531 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3316 on 44 degrees of freedom
## Multiple R-squared:  0.6383, Adjusted R-squared:  0.6136 
## F-statistic: 25.88 on 3 and 44 DF,  p-value: 8.375e-10
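
Because the response in m4_log is log(alumni_giving_rate), the coefficients are easier to read after exponentiating: each becomes a multiplicative effect on the giving rate. For example, exp(-0.0548) is about 0.95, so each additional point of student-faculty ratio is associated with roughly a 5% lower giving rate, holding the other predictors fixed. A short sketch, not in the original output:

# Convert log-scale coefficients to multiplicative effects on the giving rate
exp(coef(m4_log))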

The log transformation gives the best performance of the three models: the adjusted R-squared rises to 0.614, and the coefficient on X3 (Private) becomes marginally significant (p = 0.053).

The square-root transformation also improves on the untransformed model: the adjusted R-squared increases from 0.546 to 0.595, and the overall F-test remains highly significant.

Among the three models, the log-transformed model fits best, with the highest adjusted R-squared.
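
One caveat worth noting: R-squared values from models fit to different response scales (Y, sqrt(Y), log(Y)) are not directly comparable. A fairer comparison back-transforms the fitted values and measures error on the original giving-rate scale; the sketch below uses a naive back-transformation (no bias correction) to illustrate the idea.

# Back-transform fitted values to the original scale and compare RMSE
rmse <- function(actual, fitted) sqrt(mean((actual - fitted)^2))
c(untransformed = rmse(Y, fitted(m1)),
  sqrt_model    = rmse(Y, fitted(m3_sqrt)^2),
  log_model     = rmse(Y, exp(fitted(m4_log))))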