#1. Set the working directory to the project folder.
# The path below only exists on the original author's machine. It is applied
# only when it is present, so that on any other machine the script keeps the
# current working directory instead of aborting with
# "cannot change working directory".
project_dir <- "C:/Users/sharl/Desktop/USF/Fall 2021/LIS 4805 - Predictive Analytics/Week 7 - Midterm Project"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  message("Project folder not found; reading College.csv from ", getwd())
}

#2. Download College.csv into the working directory, read it, and save it as
# `data`. The file's first column has no header, so readr fills in a name for
# it (see the "Missing column names" warning in the recorded output).
library(readr)
data <- read_csv("College.csv")
## Warning: Missing column names filled in: 'X1' [1]
## 
## -- Column specification --------------------------------------------------------
## cols(
##   X1 = col_character(),
##   Private = col_character(),
##   Apps = col_double(),
##   Accept = col_double(),
##   Enroll = col_double(),
##   Top10perc = col_double(),
##   Top25perc = col_double(),
##   F.Undergrad = col_double(),
##   P.Undergrad = col_double(),
##   Outstate = col_double(),
##   Room.Board = col_double(),
##   Books = col_double(),
##   Personal = col_double(),
##   PhD = col_double(),
##   Terminal = col_double(),
##   S.F.Ratio = col_double(),
##   perc.alumni = col_double(),
##   Expend = col_double(),
##   Grad.Rate = col_double()
## )
#3. Print the first ten rows of the data to get a first look at the columns.
head(x = data, n = 10)
## # A tibble: 10 x 19
##    X1    Private  Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad
##    <chr> <chr>   <dbl>  <dbl>  <dbl>     <dbl>     <dbl>       <dbl>       <dbl>
##  1 Abil~ Yes      1660   1232    721        23        52        2885         537
##  2 Adel~ Yes      2186   1924    512        16        29        2683        1227
##  3 Adri~ Yes      1428   1097    336        22        50        1036          99
##  4 Agne~ Yes       417    349    137        60        89         510          63
##  5 Alas~ Yes       193    146     55        16        44         249         869
##  6 Albe~ Yes       587    479    158        38        62         678          41
##  7 Albe~ Yes       353    340    103        17        45         416         230
##  8 Albi~ Yes      1899   1720    489        37        68        1594          32
##  9 Albr~ Yes      1038    839    227        30        63         973         306
## 10 Alde~ Yes       582    498    172        21        44         799          78
## # ... with 10 more variables: Outstate <dbl>, Room.Board <dbl>, Books <dbl>,
## #   Personal <dbl>, PhD <dbl>, Terminal <dbl>, S.F.Ratio <dbl>,
## #   perc.alumni <dbl>, Expend <dbl>, Grad.Rate <dbl>
#4. Attach the required libraries: dplyr, ISLR, boot, caret.
# library() is used instead of require(): require() only returns FALSE (with a
# warning) when a package is missing, so the script would carry on and fail
# later at the first call into that package. library() stops right here with a
# clear error.
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ISLR)
library(boot)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
## 
##     melanoma
## Loading required package: ggplot2
#5. Open the help page for the College data to learn what each variable means.
help("College")
## starting httpd help server ...
##  done
#6. Drop private schools and save the remaining rows as "public".
# How many rows does public include?
public <- data[!(data$Private == "Yes"), ]
# Public contains 212 observations.


#7. Drop the first two columns (the school name and the Private flag). They
# are character values and are not needed for linear modeling.
public <- public[, -(1:2)]


#8. Set the seed to 1 so the random split below is reproducible.
set.seed(1)


#9. Create the row indices for the training set: a random 2/3 of the rows,
# sampled by createDataPartition() on the outcome Grad.Rate.
# list = FALSE (spelled out, because F is an ordinary variable that can be
# reassigned) returns the indices as a one-column matrix instead of a list.
set <- createDataPartition(public$Grad.Rate, p = 2 / 3, list = FALSE)
nrow(set)
## [1] 143
# 10. Create the train and test sets: the sampled rows and all remaining rows.
# Note: the data frame `train` shares its name with caret's train() function.
# Calls written as train(...) still reach the function, because R skips
# non-function objects when it looks up the name in a call.
train <- public[set, ]
test <- public[-set, ]
#####################################################################################################################################
#11. Fit a regression model on the train set: Grad.Rate is the dependent
# variable and ALL other columns are independent variables.
# No variable selection is done at this stage.
model <- lm(formula = Grad.Rate ~ ., data = train)

#12. Five-fold cross-validation, chosen because the sample size is small.
# ControlParameters describes the resampling scheme: method = "repeatedcv"
# with five folds, repeated five times.
ControlParameters <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 5
)

#13. Run the cross-validated linear model with caret's train() function.
# It must be fitted on the train dataset only.
modellm <- train(
  Grad.Rate ~ .,
  data = train,
  method = "lm",
  trControl = ControlParameters
)

#14. Print the model on the console to read its cross-validated RMSE, then
# save that value in an object called "RMSE_train".
modellm
## Linear Regression 
## 
## 143 samples
##  16 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 114, 115, 115, 114, 114, 115, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE    
##   12.27555  0.4078312  9.50994
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# caret keeps the resampling summary shown above in modellm$results. Extract
# its RMSE column so that RMSE_train is a single number, not a copy of the
# whole model object. Any later print of RMSE_train therefore shows this
# number rather than the model summary.
RMSE_train <- modellm$results$RMSE
RMSE_train
## [1] 12.27555
# Print the summary of modellm (coefficient table and fit statistics).
summary(object = modellm)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.913  -6.305  -0.334   6.374  38.500 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 50.0300381 10.8502044   4.611  9.7e-06 ***
## Apps         0.0012788  0.0007342   1.742 0.084018 .  
## Accept      -0.0014398  0.0014797  -0.973 0.332395    
## Enroll       0.0046894  0.0036018   1.302 0.195307    
## Top10perc   -0.0014783  0.1296047  -0.011 0.990918    
## Top25perc    0.2008193  0.1017104   1.974 0.050521 .  
## F.Undergrad -0.0004705  0.0006400  -0.735 0.463646    
## P.Undergrad -0.0013013  0.0004749  -2.740 0.007030 ** 
## Outstate     0.0019362  0.0005637   3.435 0.000803 ***
## Room.Board   0.0003553  0.0014301   0.248 0.804188    
## Books       -0.0141352  0.0077622  -1.821 0.070975 .  
## Personal    -0.0043351  0.0015223  -2.848 0.005143 ** 
## PhD          0.1052924  0.1512471   0.696 0.487610    
## Terminal    -0.2461042  0.1454937  -1.692 0.093211 .  
## S.F.Ratio    0.1973321  0.3662775   0.539 0.591010    
## perc.alumni  0.3196940  0.1328844   2.406 0.017590 *  
## Expend      -0.0003110  0.0006452  -0.482 0.630680    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.91 on 126 degrees of freedom
## Multiple R-squared:  0.5198, Adjusted R-squared:  0.4588 
## F-statistic: 8.525 on 16 and 126 DF,  p-value: 1.16e-13
#####################################################################################################################################
#15. Predict Grad.Rate for the new data (the held-out test rows) with modellm.
p <- predict(object = modellm, newdata = test)
p
##        1        2        3        4        5        6        7        8 
## 48.18019 59.60198 53.55878 53.55144 46.81897 45.12605 50.87809 62.30690 
##        9       10       11       12       13       14       15       16 
## 53.37525 43.17603 56.55543 38.35379 60.73397 52.42691 52.67701 77.75176 
##       17       18       19       20       21       22       23       24 
## 51.36309 50.02504 47.89318 64.86872 63.88854 66.77046 66.61606 51.73441 
##       25       26       27       28       29       30       31       32 
## 53.16560 52.31266 51.12853 66.26005 42.08696 74.13990 35.97242 73.72418 
##       33       34       35       36       37       38       39       40 
## 37.90832 48.35598 52.21865 54.20413 71.83962 57.62960 50.56116 54.64695 
##       41       42       43       44       45       46       47       48 
## 53.71841 71.64255 56.71228 82.01802 54.44534 46.21338 54.16526 60.10383 
##       49       50       51       52       53       54       55       56 
## 50.74132 62.49695 53.57364 94.87368 50.83840 64.38201 81.08365 44.29337 
##       57       58       59       60       61       62       63       64 
## 54.82068 40.21413 52.38911 46.39986 54.71972 52.96421 58.07081 52.25302 
##       65       66       67       68       69 
## 44.13874 43.61924 48.28823 61.12951 55.38423
#16. Errors of modellm on the test data: predicted minus observed Grad.Rate.
error <- p - test[["Grad.Rate"]]

#17. Root mean squared error of those test-set errors, saved in an object
# called "RMSE_test".
RMSE_test <- sqrt(mean(error * error))
RMSE_test
## [1] 12.26936
#18. Compare RMSE_train and RMSE_test by printing one after the other.
# In the recorded run the cross-validated RMSE on the training data (12.27555)
# and the RMSE on the held-out test rows (12.26936) are almost identical, so
# the resampling estimate was a good guide to the error on unseen rows for
# this particular split.
RMSE_train
## Linear Regression 
## 
## 143 samples
##  16 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 114, 115, 115, 114, 114, 115, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE    
##   12.27555  0.4078312  9.50994
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
RMSE_test
## [1] 12.26936
##########################################################################################################

# 19. Predict on the full "public" dataset. Is the full-data RMSE lower than
# the test-set result?
p_full <- predict(object = modellm, newdata = public)
error_full <- p_full - public[["Grad.Rate"]]
# 20. Calculate the RMSE on the full data and save the value as "RMSE_full".
RMSE_full <- sqrt(mean(error_full * error_full))
#21. Compare the RMSEs from the full data, the train data, and the test data
# by printing them one after the other.
# In the recorded run RMSE_full (10.94097) is lower than both the
# cross-validated RMSE (12.27555) and the test RMSE (12.26936). That is
# expected and does not mean the model is better: predicting on `public`
# scores every row, including the 143 rows the model was fitted on, so most of
# those errors are in-sample and therefore optimistic.
RMSE_train
## Linear Regression 
## 
## 143 samples
##  16 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 114, 115, 115, 114, 114, 115, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE    
##   12.27555  0.4078312  9.50994
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
RMSE_test
## [1] 12.26936
RMSE_full
## [1] 10.94097